def test_lgcn(model, cfg, logger):
    """Run LGCN inference, cluster the predicted edge scores, and evaluate.

    Args:
        model: the LGCN model to evaluate.
        cfg: config object; reads ``model['kwargs']``, ``test_data``,
            ``work_dir``, ``force``, ``max_sz``, ``step``, ``pool``,
            ``save_output`` and ``metrics``.
        logger: logger used for warnings.

    Side effects: optionally writes ``pred_edges_scores.npz`` and
    ``pred_labels.txt`` under ``cfg.work_dir``.
    """
    # Propagate model kwargs into the test-data config before building.
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.test_data)

    ofn_pred = os.path.join(cfg.work_dir, 'pred_edges_scores.npz')
    if os.path.isfile(ofn_pred) and not cfg.force:
        # Reuse cached edges/scores instead of re-running inference.
        data = np.load(ofn_pred)
        edges = data['edges']
        scores = data['scores']
        inst_num = data['inst_num']
        if inst_num != len(dataset):
            # FIX: Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(ofn_pred, inst_num, len(dataset)))
    else:
        edges, scores, inst_num = test(model, dataset, cfg, logger)

    # produce predicted labels
    clusters = graph_clustering_dynamic_th(edges,
                                           scores,
                                           max_sz=cfg.max_sz,
                                           step=cfg.step,
                                           pool=cfg.pool)
    pred_idx2lb = clusters2labels(clusters)
    pred_labels = intdict2ndarray(pred_idx2lb)

    if cfg.save_output:
        print('save predicted edges and scores to {}'.format(ofn_pred))
        np.savez_compressed(ofn_pred,
                            edges=edges,
                            scores=scores,
                            inst_num=inst_num)
        ofn_meta = os.path.join(cfg.work_dir, 'pred_labels.txt')
        write_meta(ofn_meta, pred_idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        gt_labels = dataset.labels
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels, metric)
        # Singleton clusters are dropped for the second evaluation pass.
        single_cluster_idxs = get_cluster_idxs(clusters, size=1)
        print('==> evaluation (removing {} single clusters)'.format(
            len(single_cluster_idxs)))
        # FIX: np.setdiff1d already returns an ndarray; the former redundant
        # np.array(remain_idxs) re-wrap was removed.
        remain_idxs = np.setdiff1d(np.arange(len(dataset)),
                                   np.array(single_cluster_idxs))
        for metric in cfg.metrics:
            evaluate(gt_labels[remain_idxs], pred_labels[remain_idxs], metric)
def generate_basic_proposals(oprefix,
                             knn_prefix,
                             feats,
                             feat_dim=256,
                             knn_method='faiss',
                             k=80,
                             th_knn=0.6,
                             th_step=0.05,
                             minsz=3,
                             maxsz=300,
                             is_rebuild=False,
                             is_save_proposals=True,
                             force=False,
                             **kwargs):
    """Build a kNN graph over ``feats``, grow super-vertex cluster proposals,
    and optionally dump one proposal file per cluster.

    Returns a ``(proposal_dir, labels_file)`` tuple pointing at the proposal
    folder and the clustering label file (both under ``oprefix``).
    """
    print('k={}, th_knn={}, th_step={}, maxsz={}, is_rebuild={}'.format(
        k, th_knn, th_step, maxsz, is_rebuild))

    # Nearest-neighbour graph over the input features (cached by build_knns).
    knn_graph = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # All outputs live under a folder named after the hyper-parameters.
    out_dir = osp.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, minsz, maxsz))
    labels_file = osp.join(out_dir, 'pred_labels.txt')
    if not osp.exists(out_dir):
        os.makedirs(out_dir)

    if is_rebuild or not osp.isfile(labels_file):
        with Timer('build super vertices'):
            clusters = super_vertex(knn_graph, k, th_knn, th_step, maxsz)
        with Timer('dump clustering to {}'.format(labels_file)):
            write_meta(labels_file, clusters2labels(clusters))
    else:
        # Clustering already cached on disk — just read it back.
        print('read clusters from {}'.format(labels_file))
        cached_lb2idxs, _ = read_meta(labels_file)
        clusters = labels2clusters(cached_lb2idxs)
    clusters = filter_clusters(clusters, minsz)

    # Optionally materialize one file per proposal cluster.
    proposal_dir = osp.join(out_dir, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(proposal_dir))
        if not osp.exists(proposal_dir):
            os.makedirs(proposal_dir)
        save_proposals(clusters,
                       knn_graph,
                       ofolder=proposal_dir,
                       force=force)

    return proposal_dir, labels_file
def deoverlap(scores,
              proposals,
              tot_inst_num,
              th_pos=-1,
              th_iou=1,
              pred_label_fn=None,
              outlier_scores=None,
              th_outlier=0.5,
              keep_outlier=False):
    """De-overlap scored cluster proposals into a single labeling via NMS,
    with optional outlier filtering/retention per proposal.

    Returns the prediction written by ``write_meta`` for ``pred_label_fn``.
    """
    print('avg_score(mean: {:.2f}, max: {:.2f}, min: {:.2f})'.format(
        scores.mean(), scores.max(), scores.min()))
    assert len(proposals) == len(scores), '{} vs {}'.format(
        len(proposals), len(scores))
    assert (outlier_scores is None) or isinstance(outlier_scores, dict)

    # Keep proposals scoring at least th_pos, most confident first.
    ranked = sorted(
        ([i, s] for i, s in enumerate(scores) if not s < th_pos),
        key=lambda pair: pair[1],
        reverse=True)

    # Load each surviving proposal; split off its outlier vertices.
    clusters = []
    o_clusters = [] if keep_outlier else None
    for prop_idx, _ in tqdm(ranked):
        fn_node = proposals[prop_idx]
        cluster, o_cluster = filter_outlier(load_data(fn_node), fn_node,
                                            outlier_scores, th_outlier)
        clusters.append(cluster)
        if keep_outlier and len(o_cluster) > 0:
            o_clusters.append(o_cluster)
    if keep_outlier:
        # Re-inject the outlier groups as extra (low-priority) clusters.
        print('#outlier_clusters: {}'.format(len(o_clusters)))
        clusters.extend(o_clusters)

    idx2lb, idx2lbs = nms(clusters, th_iou)

    # Report coverage statistics.
    multi_lb_num = sum(1 for lbs in idx2lbs.values() if len(lbs) > 1)
    inst_num = len(idx2lb)
    cls_num = len(set(idx2lb.values()))
    print('#inst: {}, #class: {}, #multi-label: {}'.format(
        inst_num, cls_num, multi_lb_num))
    print('#inst-coverage: {:.2f}'.format(1. * inst_num / tot_inst_num))

    # Persist the final labeling.
    pred_labels = write_meta(pred_label_fn, idx2lb, inst_num=tot_inst_num)
    return pred_labels
def generate_proposals(oprefix,
                       knn_prefix,
                       feats,
                       feat_dim=256,
                       knn_method='faiss',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    """Build a kNN graph, grow super-vertex cluster proposals and optionally
    save one proposal file per cluster (always with ``force=True``)."""
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    # kNN graph: every node with its top-k neighbours and their distances.
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # Output folder encodes the clustering hyper-parameters.
    out_dir = os.path.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, min_size, max_size))
    labels_path = os.path.join(out_dir, 'pred_labels.txt')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    needs_clustering = is_rebuild or not os.path.isfile(labels_path)
    if needs_clustering:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(labels_path)):
            write_meta(labels_path, clusters2labels(clusters))
    else:
        # Reuse the cached clustering from a previous run.
        print('read clusters from {}'.format(labels_path))
        cached_lb2idxs, _ = read_meta(labels_path)
        clusters = labels2clusters(cached_lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # Optionally dump one file per proposal cluster.
    if is_save_proposals:
        proposal_dir = os.path.join(out_dir, 'proposals')
        print('saving cluster proposals to {}'.format(proposal_dir))
        if not os.path.exists(proposal_dir):
            os.makedirs(proposal_dir)
        save_proposals(clusters, knns, ofolder=proposal_dir, force=True)
def deoverlap(probs, proposals, tot_inst_num, th_pos=-1, th_iou=1,
              pred_label_fn=None):
    """De-overlap scored cluster proposals into one labeling via NMS.

    Proposals scoring below ``th_pos`` are dropped; the rest are loaded in
    descending-score order and merged with ``nms`` at IoU ``th_iou``.
    Returns the prediction written by ``write_meta`` for ``pred_label_fn``.
    """
    print('avg_score(mean: {:.2f}, max: {:.2f}, min: {:.2f})'.format(
        probs.mean(), probs.max(), probs.min()))
    assert len(proposals) == len(probs), '{} vs {}'.format(
        len(proposals), len(probs))

    # Keep proposals scoring at least th_pos, most confident first.
    ranked = sorted(
        ([i, p] for i, p in enumerate(probs) if not p < th_pos),
        key=lambda pair: pair[1],
        reverse=True)

    # Load the surviving proposals in ranked order.
    clusters = [load_data(proposals[i]) for i, _ in tqdm(ranked)]

    idx2lb, idx2lbs = nms(clusters, th_iou)

    # Report coverage statistics.
    multi_lb_num = sum(1 for lbs in idx2lbs.values() if len(lbs) > 1)
    inst_num = len(idx2lb)
    cls_num = len(set(idx2lb.values()))
    print('#inst: {}, #class: {}, #multi-label: {}'.format(
        inst_num, cls_num, multi_lb_num))
    print('#inst-coverage: {:.2f}'.format(1. * inst_num / tot_inst_num))

    # Persist the final labeling.
    return write_meta(pred_label_fn, idx2lb, inst_num=tot_inst_num)
def test_cluster_seg(model, cfg, logger):
    """Run the GCN-S (segmentation) test stage: predict per-vertex outlier
    scores for each proposal, then de-overlap with and without those scores
    and evaluate both labelings.

    Reads previously predicted IoU scores from ``cfg.pred_iou_score`` and
    requires the dataset's proposal list to match the one those scores were
    produced from.
    """
    assert osp.isfile(cfg.pred_iou_score)
    if cfg.load_from:
        logger.info('load pretrained model from: {}'.format(cfg.load_from))
        load_checkpoint(model, cfg.load_from, strict=True, logger=logger)

    # Propagate model kwargs into the test-data config before building.
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    # NOTE(review): 'pred_iou_score' vs 'pred_iop_score' — both spellings are
    # used deliberately here (IoU file asserted above, IoP attr set below);
    # confirm this distinction is intentional in the config.
    setattr(cfg.test_data, 'pred_iop_score', cfg.pred_iop_score)
    dataset = build_dataset(cfg.test_data)
    processor = build_processor(cfg.stage)
    inst_num = dataset.inst_num

    # read pred_scores from file and do sanity check
    d = np.load(cfg.pred_iou_score, allow_pickle=True)
    pred_scores = d['data']
    meta = d['meta'].item()
    assert inst_num == meta['tot_inst_num'], '{} vs {}'.format(
        inst_num, meta['tot_inst_num'])
    # The dataset's proposal list must exactly match the proposal files the
    # IoU scores were computed from (same order).
    proposals = [fn_node for fn_node, _ in dataset.tot_lst]
    _proposals = []
    fn_node_pattern = '*_node.npz'
    for proposal_folder in meta['proposal_folders']:
        fn_clusters = sorted(
            glob.glob(osp.join(proposal_folder, fn_node_pattern)))
        _proposals.extend([fn_node for fn_node in fn_clusters])
    assert proposals == _proposals, '{} vs {}'.format(len(proposals),
                                                      len(_proposals))

    losses = []
    pred_outlier_scores = []
    stats = {'mean': []}
    if cfg.gpus == 1:
        data_loader = build_dataloader(dataset,
                                       processor,
                                       cfg.test_batch_size_per_gpu,
                                       cfg.workers_per_gpu,
                                       train=False)
        model = MMDataParallel(model, device_ids=range(cfg.gpus))
        if cfg.cuda:
            model.cuda()
        model.eval()
        for i, data in enumerate(data_loader):
            with torch.no_grad():
                output, loss = model(data, return_loss=True)
                losses += [loss.item()]
                if i % cfg.log_config.interval == 0:
                    if dataset.ignore_label:
                        logger.info('[Test] Iter {}/{}'.format(
                            i, len(data_loader)))
                    else:
                        logger.info('[Test] Iter {}/{}: Loss {:.4f}'.format(
                            i, len(data_loader), loss))
                if cfg.save_output:
                    # Keep the probability of the "outlier" class (index 1).
                    output = F.softmax(output, dim=1)
                    output = output[:, 1, :]
                    scores = output.data.cpu().numpy()
                    pred_outlier_scores.extend(list(scores))
                    stats['mean'] += [scores.mean()]
    else:
        # Multi-GPU testing is not supported.
        raise NotImplementedError

    if not dataset.ignore_label:
        avg_loss = sum(losses) / len(losses)
        logger.info('[Test] Overall Loss {:.4f}'.format(avg_loss))
    # NOTE(review): stats['mean'] is only filled when cfg.save_output is set;
    # with save_output disabled this divides by zero — confirm save_output is
    # always enabled for this stage.
    scores_mean = 1. * sum(stats['mean']) / len(stats['mean'])
    logger.info('mean of pred_outlier_scores: {:.4f}'.format(scores_mean))

    # save predicted scores
    if cfg.save_output:
        if cfg.load_from:
            fn = osp.basename(cfg.load_from)
        else:
            fn = 'random'
        opath = osp.join(cfg.work_dir, fn[:fn.rfind('.pth')] + '.npz')
        meta = {
            'tot_inst_num': inst_num,
            'proposal_folders': cfg.test_data.proposal_folders,
        }
        logger.info('dump pred_outlier_scores ({}) to {}'.format(
            len(pred_outlier_scores), opath))
        np.savez_compressed(opath, data=pred_outlier_scores, meta=meta)

    # post-process: map each proposal file to its predicted outlier scores
    outlier_scores = {
        fn_node: outlier_score
        for (fn_node, _), outlier_score in zip(dataset.lst,
                                               pred_outlier_scores)
    }

    # de-overlap (w gcn-s)
    pred_labels_w_seg = deoverlap(pred_scores,
                                  proposals,
                                  inst_num,
                                  cfg.th_pos,
                                  cfg.th_iou,
                                  outlier_scores=outlier_scores,
                                  th_outlier=cfg.th_outlier,
                                  keep_outlier=cfg.keep_outlier)
    # de-overlap (wo gcn-s)
    pred_labels_wo_seg = deoverlap(pred_scores, proposals, inst_num,
                                   cfg.th_pos, cfg.th_iou)

    # save predicted labels
    if cfg.save_output:
        ofn_meta_w_seg = osp.join(cfg.work_dir, 'pred_labels_w_seg.txt')
        ofn_meta_wo_seg = osp.join(cfg.work_dir, 'pred_labels_wo_seg.txt')
        print('save predicted labels to {} and {}'.format(
            ofn_meta_w_seg, ofn_meta_wo_seg))
        pred_idx2lb_w_seg = list2dict(pred_labels_w_seg, ignore_value=-1)
        pred_idx2lb_wo_seg = list2dict(pred_labels_wo_seg, ignore_value=-1)
        write_meta(ofn_meta_w_seg, pred_idx2lb_w_seg, inst_num=inst_num)
        write_meta(ofn_meta_wo_seg, pred_idx2lb_wo_seg, inst_num=inst_num)

    # evaluation: compare the two labelings against ground truth
    if not dataset.ignore_label:
        gt_labels = dataset.labels
        print('==> evaluation (with gcn-s)')
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels_w_seg, metric)
        print('==> evaluation (without gcn-s)')
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels_wo_seg, metric)
def test_gcn_e(model, cfg, logger):
    """Run the GCN-E test stage: predict (or load cached / randomize) the
    connection of each subset vertex to one of its neighbours, turn the
    resulting peak graph into clusters, and evaluate.

    Mutates ``dataset.peaks`` / ``dataset.dist2peak`` in place via the
    ``pred_peaks`` / ``pred_dist2peak`` aliases below.
    """
    # Propagate model kwargs into the test-data config before building.
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    # Aliases: appending to these mutates the dataset's own structures.
    pred_peaks = dataset.peaks
    pred_dist2peak = dataset.dist2peak

    ofn_pred = osp.join(cfg.work_dir, 'pred_conns.npz')
    if osp.isfile(ofn_pred) and not cfg.force:
        # Reuse cached connection predictions.
        data = np.load(ofn_pred)
        pred_conns = data['pred_conns']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            # NOTE(review): Logger.warn is a deprecated alias of warning, and
            # the message prints len(dataset) while the comparison uses
            # dataset.inst_num — confirm both are intended.
            logger.warn(
                'instance number in {} is different from dataset: {} vs {}'.
                format(ofn_pred, inst_num, len(dataset)))
    else:
        if cfg.random_conns:
            # Baseline: pick a random neighbour instead of the model's choice.
            pred_conns = []
            for nbr, dist, idx in zip(dataset.subset_nbrs,
                                      dataset.subset_dists,
                                      dataset.subset_idxs):
                for _ in range(cfg.max_conn):
                    pred_rel_nbr = np.random.choice(np.arange(len(nbr)))
                    pred_abs_nbr = nbr[pred_rel_nbr]
                    pred_peaks[idx].append(pred_abs_nbr)
                    pred_dist2peak[idx].append(dist[pred_rel_nbr])
                    pred_conns.append(pred_rel_nbr)
            pred_conns = np.array(pred_conns)
        else:
            # Model inference: relative neighbour indices per subset vertex.
            pred_conns = test(model, dataset, cfg, logger)
            for pred_rel_nbr, nbr, dist, idx in zip(pred_conns,
                                                    dataset.subset_nbrs,
                                                    dataset.subset_dists,
                                                    dataset.subset_idxs):
                # Translate relative neighbour positions to absolute ids.
                pred_abs_nbr = nbr[pred_rel_nbr]
                pred_peaks[idx].extend(pred_abs_nbr)
                pred_dist2peak[idx].extend(dist[pred_rel_nbr])
        inst_num = dataset.inst_num

    if len(pred_conns) > 0:
        logger.info(
            'pred_conns (nbr order): mean({:.1f}), max({}), min({})'.format(
                pred_conns.mean(), pred_conns.max(), pred_conns.min()))

    # Optional interim evaluation of the predicted peak labels.
    if not dataset.ignore_label and cfg.eval_interim:
        subset_gt_labels = dataset.subset_gt_labels
        for i in range(cfg.max_conn):
            pred_peaks_labels = np.array([
                dataset.idx2lb[pred_peaks[idx][i]]
                for idx in dataset.subset_idxs
            ])
            acc = accuracy(pred_peaks_labels, subset_gt_labels)
            logger.info(
                '[{}-th] accuracy of pred_peaks labels ({}): {:.4f}'.format(
                    i, len(pred_peaks_labels), acc))

            # the rule for nearest nbr is only appropriate when nbrs is sorted
            nearest_idxs = np.where(pred_conns[:, i] == 0)[0]
            acc = accuracy(pred_peaks_labels[nearest_idxs],
                           subset_gt_labels[nearest_idxs])
            logger.info(
                '[{}-th] accuracy of pred labels (nearest: {}): {:.4f}'.format(
                    i, len(nearest_idxs), acc))

            not_nearest_idxs = np.where(pred_conns[:, i] > 0)[0]
            acc = accuracy(pred_peaks_labels[not_nearest_idxs],
                           subset_gt_labels[not_nearest_idxs])
            logger.info(
                '[{}-th] accuracy of pred labels (not nearest: {}): {:.4f}'.
                format(i, len(not_nearest_idxs), acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                      inst_num)

    if cfg.save_output:
        logger.info(
            'save predicted connectivity and labels to {}'.format(ofn_pred))
        if not osp.isfile(ofn_pred) or cfg.force:
            np.savez_compressed(ofn_pred,
                                pred_conns=pred_conns,
                                inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)
        folder = '{}_gcne_k_{}_th_{}_ig_{}'.format(cfg.test_name, cfg.knn,
                                                   cfg.th_sim,
                                                   cfg.test_data.ignore_ratio)
        opath_pred_labels = osp.join(cfg.work_dir, folder,
                                     'tau_{}_pred_labels.txt'.format(cfg.tau))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

        # H and C-scores (homogeneity / completeness via V-measure)
        gt_dict = {}
        pred_dict = {}
        for i in range(len(dataset.gt_labels)):
            gt_dict[str(i)] = dataset.gt_labels[i]
            pred_dict[str(i)] = pred_labels[i]
        bm = ClusteringBenchmark(gt_dict)
        scores = bm.evaluate_vmeasure(pred_dict)
        # fmi_scores = bm.evaluate_fowlkes_mallows_score(pred_dict)
        print(scores)
# NOTE(review): this chunk opens mid-function — `return opath` is the tail of
# a helper defined above this excerpt (presumably the one that builds the
# output path used below as get_output_path); verify against the full file.
    return opath


if __name__ == '__main__':
    args = parse_args()
    # Look up the requested clustering baseline by name.
    cluster_func = baseline.__dict__[args.method]

    ds = BasicDataset(name=args.name,
                      prefix=args.prefix,
                      dim=args.dim,
                      normalize=not args.no_normalize)
    ds.info()
    feats = ds.features

    opath = get_output_path(args)
    with Timer('{}'.format(args.method)):
        # All CLI args are forwarded; the baseline picks what it needs.
        pred_labels = cluster_func(feats, **args.__dict__)

    # save clustering results (-1 marks unassigned instances and is skipped)
    idx2lb = {}
    for idx, lb in enumerate(pred_labels):
        if lb == -1:
            continue
        idx2lb[idx] = lb

    inst_num = len(pred_labels)
    print('coverage: {} / {} = {:.4f}'.format(len(idx2lb), inst_num,
                                              1. * len(idx2lb) / inst_num))
    write_meta(opath, idx2lb, inst_num=inst_num)
# NOTE(review): this chunk opens mid-loop — `cluster`, `lb`, `ious` and the
# enclosing for-loop header(s) are defined above this excerpt; the indentation
# below is a best-effort reconstruction and must be checked against the full
# file.
        idxs = lb2idxs[lb]
        iou = compute_iou(cluster, idxs)
        ious.append(iou)
    ious = np.array(ious)

    # rank by iou: keep clusters whose IoU with their gt group exceeds th_pos
    pos_g_labels = np.where(ious > args.th_pos)[0]
    clusters = [[clusters[i], ious[i]] for i in pos_g_labels]
    clusters = sorted(clusters, key=lambda x: x[1], reverse=True)
    clusters = [n for n, _ in clusters]

    inst_num = len(idx2lb)
    pos_idx_set = set()
    for c in clusters:
        pos_idx_set |= set(c)
    print('inst-coverage before nms: {}'.format(1. * len(pos_idx_set) /
                                                inst_num))

    # nms: de-overlap the ranked clusters at IoU threshold args.th_iou
    idx2lb, _ = nms(clusters, args.th_iou)

    # output stats
    inst_num = len(idx2lb)
    # NOTE(review): len(idx2lb.values()) == len(idx2lb), so cls_num always
    # equals inst_num here; sibling deoverlap() implementations use
    # len(set(idx2lb.values())) — likely a bug, confirm before fixing.
    cls_num = len(idx2lb.values())
    print('#inst: {}, #class: {}'.format(inst_num, cls_num))
    print('#inst-coverage: {:.2f}'.format(1. * inst_num / tot_inst_num))

    # save to file
    write_meta(pred_label_fn, idx2lb, inst_num=tot_inst_num)
def test_cluster_det(model, cfg, logger):
    """Run the GCN-D (detection) test stage: predict one IoU-like quality
    score per cluster proposal, de-overlap the proposals with those scores,
    and evaluate the resulting labeling.
    """
    if cfg.load_from:
        logger.info('load pretrained model from: {}'.format(cfg.load_from))
        load_checkpoint(model, cfg.load_from, strict=True, logger=logger)

    # Propagate model kwargs into the test-data config before building.
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.test_data)
    processor = build_processor(cfg.stage)

    losses = []
    pred_scores = []
    if cfg.gpus == 1:
        data_loader = build_dataloader(dataset,
                                       processor,
                                       cfg.test_batch_size_per_gpu,
                                       cfg.workers_per_gpu,
                                       train=False)
        model = MMDataParallel(model, device_ids=range(cfg.gpus))
        if cfg.cuda:
            model.cuda()
        model.eval()
        for i, data in enumerate(data_loader):
            with torch.no_grad():
                output, loss = model(data, return_loss=True)
                losses += [loss.item()]
                if i % cfg.log_config.interval == 0:
                    if dataset.ignore_label:
                        logger.info('[Test] Iter {}/{}'.format(
                            i, len(data_loader)))
                    else:
                        logger.info('[Test] Iter {}/{}: Loss {:.4f}'.format(
                            i, len(data_loader), loss))
                if cfg.save_output:
                    # One scalar score per proposal in this batch.
                    output = output.view(-1)
                    prob = output.data.cpu().numpy()
                    pred_scores.append(prob)
    else:
        # Multi-GPU testing is not supported.
        raise NotImplementedError

    if not dataset.ignore_label:
        avg_loss = sum(losses) / len(losses)
        logger.info('[Test] Overall Loss {:.4f}'.format(avg_loss))

    # save predicted scores
    if cfg.save_output:
        if cfg.load_from:
            fn = os.path.basename(cfg.load_from)
        else:
            fn = 'random'
        opath = os.path.join(cfg.work_dir, fn[:fn.rfind('.pth')] + '.npz')
        meta = {
            'tot_inst_num': dataset.inst_num,
            'proposal_folders': cfg.test_data.proposal_folders,
        }
        print('dump pred_score to {}'.format(opath))
        # NOTE(review): pred_scores is flattened to one ndarray only inside
        # this save_output branch, yet deoverlap() below consumes it either
        # way — with save_output disabled it receives a list of per-batch
        # arrays; confirm save_output is always enabled for this stage.
        pred_scores = np.concatenate(pred_scores).ravel()
        np.savez_compressed(opath, data=pred_scores, meta=meta)

    # de-overlap the proposals using the predicted scores
    proposals = [fn_node for fn_node, _ in dataset.lst]
    pred_labels = deoverlap(pred_scores, proposals, dataset.inst_num,
                            cfg.th_pos, cfg.th_iou)

    # save predicted labels
    if cfg.save_output:
        ofn_meta = os.path.join(cfg.work_dir, 'pred_labels.txt')
        print('save predicted labels to {}'.format(ofn_meta))
        pred_idx2lb = list2dict(pred_labels, ignore_value=-1)
        write_meta(ofn_meta, pred_idx2lb, inst_num=dataset.inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        gt_labels = dataset.labels
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels, metric)
def generate_iter_proposals(oprefix,
                            knn_prefix,
                            feats,
                            feat_dim=256,
                            knn_method='faiss',
                            k=80,
                            th_knn=0.6,
                            th_step=0.05,
                            minsz=3,
                            maxsz=300,
                            sv_minsz=2,
                            sv_maxsz=5,
                            sv_labels=None,
                            sv_knn_prefix=None,
                            is_rebuild=False,
                            is_save_proposals=True,
                            force=False,
                            **kwargs):
    """Generate next-iteration cluster proposals on top of an existing
    super-vertex clustering (``sv_labels``): average the features of each
    super vertex, build a kNN graph over those averages, grow super vertices
    of super vertices, and flatten them back to instance-level proposals.

    Returns a ``(proposal_folder, pred_labels_file)`` tuple.
    Raises FileNotFoundError if ``sv_labels`` does not exist.
    """
    assert sv_minsz >= 2, "sv_minsz >= 2 to avoid duplicated proposals"
    print('k={}, th_knn={}, th_step={}, minsz={}, maxsz={}, '
          'sv_minsz={}, sv_maxsz={}, is_rebuild={}'.format(
              k, th_knn, th_step, minsz, maxsz, sv_minsz, sv_maxsz,
              is_rebuild))

    if not os.path.exists(sv_labels):
        raise FileNotFoundError('{} not found.'.format(sv_labels))

    if sv_knn_prefix is None:
        sv_knn_prefix = knn_prefix

    # get iter and knns from super vertex path
    _iter = get_iter_from_path(sv_labels) + 1
    knns_inst = get_knns_from_path(sv_labels, sv_knn_prefix, feats)
    print('read sv_clusters from {}'.format(sv_labels))
    sv_lb2idxs, sv_idx2lb = read_meta(sv_labels)
    inst_num = len(sv_idx2lb)
    sv_clusters = labels2clusters(sv_lb2idxs)
    # sv_clusters = filter_clusters(sv_clusters, minsz)
    # Each super vertex is represented by the mean of its members' features.
    feats = np.array([feats[c, :].mean(axis=0) for c in sv_clusters])
    print('average feature of super vertices:', feats.shape)

    # build knns over the super-vertex features
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix,
        '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_sv_minsz_{}_maxsz_{}_iter_{}'.
        format(knn_method, k, th_knn, th_step, minsz, maxsz, sv_minsz,
               sv_maxsz, _iter))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices (iter={})'.format(_iter)):
            clusters = super_vertex(knns, k, th_knn, th_step, sv_maxsz)
            clusters = filter_clusters(clusters, sv_minsz)
            # Flatten super-vertex groups back to instance-level clusters.
            clusters = [[x for c in cluster for x in sv_clusters[c]]
                        for cluster in clusters]
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels, inst_num=inst_num)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz, maxsz)

    # output cluster proposals (saved against the instance-level knns)
    ofolder_proposals = os.path.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not os.path.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters,
                       knns_inst,
                       ofolder=ofolder_proposals,
                       force=force)
    return ofolder_proposals, ofn_pred_labels
def test_gcn_v(model, cfg, logger):
    """Run the GCN-V test stage: predict per-vertex confidences, convert
    them to peaks and clusters, evaluate, and optionally redo the clustering
    on the GCN's learned embedding (``cfg.use_gcn_feat``).

    Side effects: may cache predicted confidences, GCN features and label
    files under ``cfg.work_dir``; may copy clustered images to disk (see the
    NOTE at the end).
    """
    # Propagate model kwargs into the test-data config before building.
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    folder = '{}_gcnv_k_{}_th_{}'.format(cfg.test_name, cfg.knn, cfg.th_sim)
    oprefix = osp.join(cfg.work_dir, folder)
    oname = osp.basename(rm_suffix(cfg.load_from))
    opath_pred_confs = osp.join(oprefix, 'pred_confs', '{}.npz'.format(oname))

    if osp.isfile(opath_pred_confs) and not cfg.force:
        # Reuse cached confidence predictions.
        data = np.load(opath_pred_confs)
        pred_confs = data['pred_confs']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            # FIX: Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(opath_pred_confs, inst_num, len(dataset)))
    else:
        pred_confs, gcn_feat = test(model, dataset, cfg, logger)
        inst_num = dataset.inst_num

    logger.info('pred_confs: mean({:.4f}). max({:.4f}), min({:.4f})'.format(
        pred_confs.mean(), pred_confs.max(), pred_confs.min()))

    logger.info('Convert to cluster')
    with Timer('Predition to peaks'):
        pred_dist2peak, pred_peaks = confidence_to_peaks(
            dataset.dists, dataset.nbrs, pred_confs, cfg.max_conn)

    if not dataset.ignore_label and cfg.eval_interim:
        # evaluate the intermediate results
        for i in range(cfg.max_conn):
            num = len(dataset.peaks)
            pred_peaks_i = np.arange(num)
            peaks_i = np.arange(num)
            for j in range(num):
                if len(pred_peaks[j]) > i:
                    pred_peaks_i[j] = pred_peaks[j][i]
                if len(dataset.peaks[j]) > i:
                    peaks_i[j] = dataset.peaks[j][i]
            acc = accuracy(pred_peaks_i, peaks_i)
            logger.info('[{}-th conn] accuracy of peak match: {:.4f}'.format(
                i + 1, acc))
            # Fraction of predicted peaks carrying the same gt label as
            # their source vertex.
            acc = 0.
            for idx, peak in enumerate(pred_peaks_i):
                acc += int(dataset.idx2lb[peak] == dataset.idx2lb[idx])
            acc /= len(pred_peaks_i)
            logger.info(
                '[{}-th conn] accuracy of peak label match: {:.4f}'.format(
                    i + 1, acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau_0)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau_0,
                                      inst_num)

    if cfg.save_output:
        logger.info('save predicted confs to {}'.format(opath_pred_confs))
        mkdir_if_no_exists(opath_pred_confs)
        np.savez_compressed(opath_pred_confs,
                            pred_confs=pred_confs,
                            inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)
        opath_pred_labels = osp.join(
            cfg.work_dir, folder, 'tau_{}_pred_labels.txt'.format(cfg.tau_0))
        logger.info('save predicted labels to {}'.format(opath_pred_labels))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

    if cfg.use_gcn_feat:
        # gcn_feat is saved to disk for GCN-E
        opath_feat = osp.join(oprefix, 'features', '{}.bin'.format(oname))
        if not osp.isfile(opath_feat) or cfg.force:
            mkdir_if_no_exists(opath_feat)
            write_feat(opath_feat, gcn_feat)

        name = rm_suffix(osp.basename(opath_feat))
        prefix = oprefix
        ds = BasicDataset(name=name,
                          prefix=prefix,
                          dim=cfg.model['kwargs']['nhid'],
                          normalize=True)
        ds.info()

        # use top embedding of GCN to rebuild the kNN graph
        with Timer('connect to higher confidence with use_gcn_feat'):
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix,
                              ds.features,
                              cfg.knn_method,
                              cfg.knn,
                              is_rebuild=True)
            dists, nbrs = knns2ordered_nbrs(knns)
            pred_dist2peak, pred_peaks = confidence_to_peaks(
                dists, nbrs, pred_confs, cfg.max_conn)
            pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                          inst_num)

        # save clustering results
        if cfg.save_output:
            oname_meta = '{}_gcn_feat'.format(name)
            opath_pred_labels = osp.join(
                oprefix, oname_meta, 'tau_{}_pred_labels.txt'.format(cfg.tau))
            mkdir_if_no_exists(opath_pred_labels)
            idx2lb = list2dict(pred_labels, ignore_value=-1)
            write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

        # evaluation
        if not dataset.ignore_label:
            print('==> evaluation')
            for metric in cfg.metrics:
                evaluate(dataset.gt_labels, pred_labels, metric)

    # FIX: removed a leftover `import pdb; pdb.set_trace()` that halted every
    # non-interactive run at this point (and the now-unused `import pdb`).
    # NOTE(review): the block below is debugging residue with hardcoded
    # absolute paths — it copies images into per-cluster folders and will
    # fail on any other machine; consider moving it behind a config flag.
    import json
    import os
    import shutil
    img_labels = json.load(
        open(r'/home/finn/research/data/clustering_data/test_index.json',
             'r',
             encoding='utf-8'))
    output = r'/home/finn/research/data/clustering_data/mr_gcn_output'
    for label in set(pred_labels):
        if not os.path.exists(os.path.join(output, f'cluter_{label}')):
            os.mkdir(os.path.join(output, f'cluter_{label}'))
    for image in img_labels:
        shutil.copy2(
            image,
            os.path.join(
                os.path.join(output,
                             f'cluter_{pred_labels[img_labels[image]]}'),
                os.path.split(image)[-1]))
def generate_proposals(oprefix,
                       feats,
                       feat_dim=256,
                       knn_method='hnsw',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    """Retrieve kNN (hnsw or faiss, cached on disk), grow super-vertex
    cluster proposals and optionally dump one proposal file per cluster.

    Raises KeyError for an unsupported ``knn_method``.
    """
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.\
        format(k, th_knn, th_step, max_size, is_rebuild))

    ## knn retrieval (results cached as <oprefix>/<method>_k_<k>.npz)
    oprefix = os.path.join(oprefix, '{}_k_{}'.format(knn_method, k))
    knn_fn = oprefix + '.npz'
    if not os.path.isfile(knn_fn) or is_rebuild:
        index_fn = oprefix + '.index'
        with Timer('build index'):
            # Backend modules are imported lazily so only the selected
            # dependency is required.
            if knn_method == 'hnsw':
                from proposals import knn_hnsw
                index = knn_hnsw(feats, k, index_fn)
            elif knn_method == 'faiss':
                from proposals import knn_faiss
                index = knn_faiss(feats, k, index_fn)
            else:
                raise KeyError('Unsupported method({}). \
                    Only support hnsw and faiss currently'.format(knn_method))
            knns = index.get_knns()
        with Timer('dump knns to {}'.format(knn_fn)):
            dump_data(knn_fn, knns, force=True)
    else:
        print('read knn from {}'.format(knn_fn))
        knns = load_data(knn_fn)

    # obtain cluster proposals (cached as pred_labels.txt)
    ofolder = oprefix + '_th_{}_step_{}_minsz_{}_maxsz_{}_iter0'.\
        format(th_knn, th_step, min_size, max_size)
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        ofolder = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder))
        if not os.path.exists(ofolder):
            os.makedirs(ofolder)
        save_proposals(clusters, knns, ofolder=ofolder, force=True)