Example #1
File: dataset.py Project: yuk12/dgl
    def __init__(self, features, labels, cluster_features=None, k=10, levels=1, faiss_gpu=False):
        self.k = k
        self.gs = []
        self.nbrs = []
        self.dists = []
        self.levels = levels

        # Initialize features and labels
        features = l2norm(features.astype('float32'))
        global_features = features.copy()
        if cluster_features is None:
            cluster_features = features
        global_num_nodes = features.shape[0]
        global_edges = ([], [])
        global_peaks = np.array([], dtype=np.int64)  # np.long was removed in NumPy 1.24
        ids = np.arange(global_num_nodes)

        # Recursive graph construction
        for lvl in range(self.levels):
            if features.shape[0] <= self.k:
                self.levels = lvl
                break
            if faiss_gpu:
                knns = build_knns(features, self.k, 'faiss_gpu')
            else:
                knns = build_knns(features, self.k, 'faiss')
            dists, nbrs = knns2ordered_nbrs(knns)
            self.nbrs.append(nbrs)
            self.dists.append(dists)
            density = density_estimation(dists, nbrs, labels)

            g = self._build_graph(features, cluster_features, labels, density, knns)
            self.gs.append(g)

            if lvl >= self.levels - 1:
                break

            # Decode peak nodes
            new_pred_labels, peaks,\
                global_edges, global_pred_labels, global_peaks = decode(g, 0, 'sim', True,
                                                                        ids, global_edges, global_num_nodes,
                                                                        global_peaks)
            ids = ids[peaks]
            features, labels, cluster_features = build_next_level(features, labels, peaks,
                                                                  global_features, global_pred_labels, global_peaks)
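The constructor above assumes an `l2norm` helper from the surrounding project. A minimal sketch of row-wise L2 normalization (a hypothetical reconstruction, not the project's actual code):

import numpy as np

def l2norm(x, eps=1e-12):
    # Scale each row to unit L2 norm so inner products become cosine similarities.
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / np.maximum(norms, eps)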
Example #2
def knn_dbscan(feats, eps, min_samples, prefix, name, knn_method, knn, th_sim,
               **kwargs):
    knn_prefix = os.path.join(prefix, 'knns', name)
    knns = build_knns(knn_prefix, feats, knn_method, knn)
    sparse_affinity = knns2spmat(knns, knn, th_sim)
    db = cluster.DBSCAN(eps=eps,
                        min_samples=min_samples,
                        n_jobs=mp.cpu_count(),
                        metric='precomputed').fit(sparse_affinity)
    return db.labels_
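A self-contained toy run of scikit-learn's DBSCAN with `metric='precomputed'`, the call pattern `knn_dbscan` relies on (dense matrix here for brevity; note that scikit-learn interprets precomputed entries as distances):

import numpy as np
from sklearn.cluster import DBSCAN

# Two tight pairs of points, far apart from each other.
D = np.array([[0.0, 0.1, 5.0, 5.0],
              [0.1, 0.0, 5.0, 5.0],
              [5.0, 5.0, 0.0, 0.2],
              [5.0, 5.0, 0.2, 0.0]])
labels = DBSCAN(eps=0.5, min_samples=2, metric='precomputed').fit(D).labels_
print(labels)  # -> [0 0 1 1]: each pair forms its own cluster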
def generate_basic_proposals(oprefix,
                             knn_prefix,
                             feats,
                             feat_dim=256,
                             knn_method='faiss',
                             k=80,
                             th_knn=0.6,
                             th_step=0.05,
                             minsz=3,
                             maxsz=300,
                             is_rebuild=False,
                             is_save_proposals=True,
                             force=False,
                             **kwargs):
    print('k={}, th_knn={}, th_step={}, maxsz={}, is_rebuild={}'.format(
        k, th_knn, th_step, maxsz, is_rebuild))

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = osp.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, minsz, maxsz))
    ofn_pred_labels = osp.join(ofolder, 'pred_labels.txt')
    if not osp.exists(ofolder):
        os.makedirs(ofolder)
    if not osp.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, maxsz)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz)

    # output cluster proposals
    ofolder_proposals = osp.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not osp.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters, knns, ofolder=ofolder_proposals, force=force)

    return ofolder_proposals, ofn_pred_labels
Example #4
def chinese_whispers_fast(feats, prefix, name, knn_method, knn, th_sim, iters,
                          **kwargs):
    """ Chinese Whispers Clustering Algorithm

    Paper: Chinese whispers: an efficient graph clustering algorithm
            and its application to natural language processing problems.
    This implementation follows the matrix operation as described in Figure.4
    int the paper. We switch the `maxrow` and `D^{t-1} * A_G` to make it
    easier for post-processing.
    The current result is inferior to `chinese_whispers` as it lacks of the
    random mechanism as the iterative algorithm. The paper introduce two
    operations to tackle this issue, namely `random mutation` and `keep class`.
    However, it is not very clear how to set this two hyper-parameters.
    """
    assert len(feats) > 1

    with Timer('create graph'):
        knn_prefix = os.path.join(prefix, 'knns', name)
        knns = build_knns(knn_prefix, feats, knn_method, knn)
        dists, nbrs = knns2ordered_nbrs(knns, sort=True)
        spmat = fast_knns2spmat(knns, knn, th_sim, use_sim=True)
        A = build_symmetric_adj(spmat, self_loop=False)

        node_num = len(feats)
        edge_num = A.nnz
        print('#nodes: {}, #edges: {}'.format(node_num, edge_num))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        D = identity(node_num)
        for _ in range(iters):
            D = D * A  # equivalent to D.dot(A)
            D = _maxrow(D, node_num)

        assert D.nnz == node_num

    clusters = {}
    assigned_clusters = D.tocoo().col
    for (node, assigned_cluster) in enumerate(assigned_clusters):
        if assigned_cluster not in clusters:
            clusters[assigned_cluster] = []
        clusters[assigned_cluster].append(node)

    print('#cluster: {}'.format(len(clusters)))
    labels = clusters2labels(clusters.values())
    labels = list(labels.values())

    return labels
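To make the docstring's matrix formulation concrete, here is a dense NumPy sketch of the paper's voting update D^t = maxrow(A_G * D^{t-1}) on a toy graph (the repository's sparse `_maxrow` variant multiplies in the swapped order, as the docstring notes):

import numpy as np

def maxrow(M):
    # One-hot argmax per row: keep only the strongest class vote.
    out = np.zeros_like(M)
    out[np.arange(M.shape[0]), M.argmax(axis=1)] = 1.0
    return out

# Toy adjacency: a triangle (0, 1, 2) with a pendant node 3 attached to node 0.
A = np.array([[0, 1, 1, 1],
              [1, 0, 1, 0],
              [1, 1, 0, 0],
              [1, 0, 0, 0]], dtype=float)
D = np.eye(4)  # D[i, c] = 1 means node i currently holds class c
for _ in range(5):
    D = maxrow(A @ D)  # row i accumulates its neighbors' votes per class
print(D.argmax(axis=1))  # -> [0 0 0 0]: the whole component settles on one class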
def generate_proposals(oprefix,
                       knn_prefix,
                       feats,
                       feat_dim=256,
                       knn_method='faiss',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    # build knns
    # each node and its top-k nearest nodes, along with their distances
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, min_size, max_size))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        ofolder = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder))
        if not os.path.exists(ofolder):
            os.makedirs(ofolder)
        save_proposals(clusters, knns, ofolder=ofolder, force=True)
Example #6
    def __init__(self, cfg):
        feat_path = cfg['feat_path']
        label_path = cfg.get('label_path', None)
        knn_graph_path = cfg.get('knn_graph_path', None)

        self.k = cfg['k']
        self.feature_dim = cfg['feature_dim']
        self.is_norm_feat = cfg.get('is_norm_feat', True)

        self.th_sim = cfg.get('th_sim', 0.)
        self.max_conn = cfg.get('max_conn', 1)

        self.ignore_ratio = cfg.get('ignore_ratio', 0.8)
        self.ignore_small_confs = cfg.get('ignore_small_confs', True)
        self.use_candidate_set = cfg.get('use_candidate_set', True)

        self.nproc = cfg.get('nproc', 1)
        self.max_qsize = cfg.get('max_qsize', int(1e5))

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.inst_num = len(self.idx2lb)
                self.gt_labels = intdict2ndarray(self.idx2lb)
                self.ignore_label = False
            else:
                self.inst_num = -1
                self.ignore_label = True
            self.features = read_probs(feat_path, self.inst_num,
                                       self.feature_dim)
            if self.is_norm_feat:
                self.features = l2norm(self.features)
            if self.inst_num == -1:
                self.inst_num = self.features.shape[0]
            self.size = self.inst_num
            assert self.size == self.features.shape[0]

        print('feature shape: {}, k: {}, norm_feat: {}'.format(
            self.features.shape, self.k, self.is_norm_feat))

        with Timer('read knn graph'):
            if knn_graph_path is not None:
                knns = np.load(knn_graph_path)['data']
            else:
                prefix = osp.dirname(feat_path)
                name = rm_suffix(osp.basename(feat_path))
                # find root folder of `features`
                prefix = osp.dirname(prefix)
                knn_prefix = osp.join(prefix, 'knns', name)
                knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                                  cfg.knn)
            assert self.inst_num == len(knns), "{} vs {}".format(
                self.inst_num, len(knns))

            adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

            # build symmetric adjacency matrix
            adj = build_symmetric_adj(adj, self_loop=True)
            self.adj = row_normalize(adj)

            # convert knns to (dists, nbrs)
            self.dists, self.nbrs = knns2ordered_nbrs(knns, sort=True)

            if cfg.pred_confs != '':
                print('read estimated confidence from {}'.format(
                    cfg.pred_confs))
                self.confs = np.load(cfg.pred_confs)['pred_confs']
            else:
                print('use unsupervised density as confidence')
                assert self.radius
                from vegcn.confidence import density
                self.confs = density(self.dists, radius=self.radius)

            assert 0 <= self.ignore_ratio <= 1
            if self.ignore_ratio == 1:
                self.ignore_set = set(np.arange(len(self.confs)))
            else:
                num = int(len(self.confs) * self.ignore_ratio)
                confs = self.confs
                if not self.ignore_small_confs:
                    confs = -confs
                self.ignore_set = set(np.argpartition(confs, num)[:num])

        print(
            'ignore_ratio: {}, ignore_small_confs: {}, use_candidate_set: {}'.
            format(self.ignore_ratio, self.ignore_small_confs,
                   self.use_candidate_set))
        print('#ignore_set: {} / {} = {:.3f}'.format(
            len(self.ignore_set), self.inst_num,
            1. * len(self.ignore_set) / self.inst_num))

        with Timer('Prepare sub-graphs'):
            # construct subgraphs with larger confidence
            self.peaks = {i: [] for i in range(self.inst_num)}
            self.dist2peak = {i: [] for i in range(self.inst_num)}

            if self.nproc > 1:
                # multi-process
                import multiprocessing as mp
                pool = mp.Pool(self.nproc)
                results = []
                num = int(self.inst_num / self.max_qsize) + 1
                for i in tqdm(range(num)):
                    beg = int(i * self.max_qsize)
                    end = min(beg + self.max_qsize, self.inst_num)
                    lst = [j for j in range(beg, end)]
                    results.extend(
                        list(
                            tqdm(pool.imap(self.get_subgraph, lst),
                                 total=len(lst))))  # imap streams results, so tqdm can show real progress
                pool.close()
                pool.join()
            else:
                results = [
                    self.get_subgraph(i) for i in tqdm(range(self.inst_num))
                ]

            self.adj_lst = []
            self.feat_lst = []
            self.lb_lst = []
            self.subset_gt_labels = []
            self.subset_idxs = []
            self.subset_nbrs = []
            self.subset_dists = []
            for result in results:
                if result is None:
                    continue
                elif len(result) == 3:
                    i, nbr, dist = result
                    self.peaks[i].extend(nbr)
                    self.dist2peak[i].extend(dist)
                    continue
                i, nbr, dist, feat, adj, lb = result
                self.subset_idxs.append(i)
                self.subset_nbrs.append(nbr)
                self.subset_dists.append(dist)
                self.feat_lst.append(feat)
                self.adj_lst.append(adj)
                if not self.ignore_label:
                    self.subset_gt_labels.append(self.idx2lb[i])
                    self.lb_lst.append(lb)
            self.subset_gt_labels = np.array(self.subset_gt_labels)

            self.size = len(self.feat_lst)
            assert self.size == len(self.adj_lst)
            if not self.ignore_label:
                assert self.size == len(self.lb_lst)
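The ignore-set logic above leans on `np.argpartition`, which finds the indices of the `num` smallest confidences without a full sort (negating the array, as done when `ignore_small_confs` is False, selects the largest instead). A quick illustration:

import numpy as np

confs = np.array([0.9, 0.1, 0.5, 0.3, 0.7])
num = 2
ignore = np.argpartition(confs, num)[:num]
print(sorted(ignore))  # -> [1, 3]: the two lowest-confidence indices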
Example #7
    def __init__(self, cfg):
        feat_path = cfg['feat_path']
        label_path = cfg.get('label_path', None)
        knn_graph_path = cfg.get('knn_graph_path', None)

        self.k = cfg['k']
        self.feature_dim = cfg['feature_dim']
        self.is_norm_feat = cfg.get('is_norm_feat', True)
        self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

        self.th_sim = cfg.get('th_sim', 0.)
        self.max_conn = cfg.get('max_conn', 1)
        self.conf_metric = cfg.get('conf_metric')

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.inst_num = len(self.idx2lb)
                self.gt_labels = intdict2ndarray(self.idx2lb)
                self.ignore_label = False
            else:
                self.inst_num = -1
                self.ignore_label = True
            self.features = read_probs(feat_path, self.inst_num,
                                       self.feature_dim)
            if self.is_norm_feat:
                self.features = l2norm(self.features)
            if self.inst_num == -1:
                self.inst_num = self.features.shape[0]
            self.size = 1 # take the entire graph as input

        with Timer('read knn graph'):
            if knn_graph_path is not None and os.path.isfile(knn_graph_path):
                knns = np.load(knn_graph_path)['data']
            else:
                if knn_graph_path is not None:
                    print('knn_graph_path does not exist: {}'.format(
                        knn_graph_path))
                
                prefix = osp.dirname(feat_path)
                name = rm_suffix(osp.basename(feat_path))
                # find root folder of `features`
                prefix = osp.dirname(prefix)
                knn_prefix = osp.join(prefix, 'knns', name)
                knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                                  cfg.knn)

            adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

            # build symmetric adjacency matrix
            adj = build_symmetric_adj(adj, self_loop=True)
            adj = row_normalize(adj)
            if self.save_decomposed_adj:
                adj = sparse_mx_to_indices_values(adj)
                self.adj_indices, self.adj_values, self.adj_shape = adj
            else:
                self.adj = adj

            # convert knns to (dists, nbrs)
            self.dists, self.nbrs = knns2ordered_nbrs(knns)

        print('feature shape: {}, k: {}, norm_feat: {}'.format(
            self.features.shape, self.k, self.is_norm_feat))

        if not self.ignore_label:
            with Timer('Prepare ground-truth label'):
                self.labels = confidence(feats=self.features,
                                         dists=self.dists,
                                         nbrs=self.nbrs,
                                         metric=self.conf_metric,
                                         idx2lb=self.idx2lb,
                                         lb2idxs=self.lb2idxs)
                if cfg.eval_interim:
                    _, self.peaks = confidence_to_peaks(
                        self.dists, self.nbrs, self.labels, self.max_conn)
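`sparse_mx_to_indices_values` is a project helper; a plausible minimal reconstruction (hypothetical, not the repository's actual code) that decomposes a SciPy sparse matrix into the (indices, values, shape) triple a COO sparse tensor needs:

import numpy as np
import scipy.sparse as sp

def sparse_mx_to_indices_values(mx):
    # Break a scipy sparse matrix into the raw pieces of a COO tensor.
    mx = sp.coo_matrix(mx).astype(np.float32)
    indices = np.vstack((mx.row, mx.col))  # shape: 2 x nnz
    return indices, mx.data, mx.shape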
Example #8
def generate_iter_proposals(oprefix,
                            knn_prefix,
                            feats,
                            feat_dim=256,
                            knn_method='faiss',
                            k=80,
                            th_knn=0.6,
                            th_step=0.05,
                            minsz=3,
                            maxsz=300,
                            sv_minsz=2,
                            sv_maxsz=5,
                            sv_labels=None,
                            sv_knn_prefix=None,
                            is_rebuild=False,
                            is_save_proposals=True,
                            force=False,
                            **kwargs):

    assert sv_minsz >= 2, "sv_minsz >= 2 to avoid duplicated proposals"
    print('k={}, th_knn={}, th_step={}, minsz={}, maxsz={}, '
          'sv_minsz={}, sv_maxsz={}, is_rebuild={}'.format(
              k, th_knn, th_step, minsz, maxsz, sv_minsz, sv_maxsz,
              is_rebuild))

    if sv_labels is None or not os.path.exists(sv_labels):
        raise FileNotFoundError('{} not found.'.format(sv_labels))

    if sv_knn_prefix is None:
        sv_knn_prefix = knn_prefix

    # get iter and knns from super vertex path
    _iter = get_iter_from_path(sv_labels) + 1
    knns_inst = get_knns_from_path(sv_labels, sv_knn_prefix, feats)
    print('read sv_clusters from {}'.format(sv_labels))
    sv_lb2idxs, sv_idx2lb = read_meta(sv_labels)
    inst_num = len(sv_idx2lb)
    sv_clusters = labels2clusters(sv_lb2idxs)
    # sv_clusters = filter_clusters(sv_clusters, minsz)
    feats = np.array([feats[c, :].mean(axis=0) for c in sv_clusters])
    print('average feature of super vertices:', feats.shape)

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix,
        '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_sv_minsz_{}_maxsz_{}_iter_{}'.
        format(knn_method, k, th_knn, th_step, minsz, maxsz, sv_minsz,
               sv_maxsz, _iter))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices (iter={})'.format(_iter)):
            clusters = super_vertex(knns, k, th_knn, th_step, sv_maxsz)
            clusters = filter_clusters(clusters, sv_minsz)
            clusters = [[x for c in cluster for x in sv_clusters[c]]
                        for cluster in clusters]
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels, inst_num=inst_num)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz, maxsz)

    # output cluster proposals
    ofolder_proposals = os.path.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not os.path.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters,
                       knns_inst,
                       ofolder=ofolder_proposals,
                       force=force)

    return ofolder_proposals, ofn_pred_labels
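The nested comprehension in the middle of `generate_iter_proposals` expands clusters of super vertices back into instance ids. A small standalone example of that mapping:

sv_clusters = [[0, 1], [2], [3, 4]]  # super vertex -> member instance ids
clusters = [[0, 2], [1]]             # clusters over super-vertex indices
flat = [[x for c in cluster for x in sv_clusters[c]] for cluster in clusters]
print(flat)  # -> [[0, 1, 3, 4], [2]]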
Example #9
def get_knns_from_path(s, knn_prefix, feats):
    dic = parse_path(s)
    k = int(dic['k'])
    knn_method = dic['knn_method']
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild=False)
    return knns
Example #10
def test_gcn_v(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    folder = '{}_gcnv_k_{}_th_{}'.format(cfg.test_name, cfg.knn, cfg.th_sim)
    oprefix = osp.join(cfg.work_dir, folder)
    oname = osp.basename(rm_suffix(cfg.load_from))
    opath_pred_confs = osp.join(oprefix, 'pred_confs', '{}.npz'.format(oname))

    if osp.isfile(opath_pred_confs) and not cfg.force:
        data = np.load(opath_pred_confs)
        pred_confs = data['pred_confs']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(opath_pred_confs, inst_num, dataset.inst_num))
    else:
        pred_confs, gcn_feat = test(model, dataset, cfg, logger)
        inst_num = dataset.inst_num

    logger.info('pred_confs: mean({:.4f}), max({:.4f}), min({:.4f})'.format(
        pred_confs.mean(), pred_confs.max(), pred_confs.min()))

    logger.info('Convert to cluster')
    with Timer('Prediction to peaks'):
        pred_dist2peak, pred_peaks = confidence_to_peaks(
            dataset.dists, dataset.nbrs, pred_confs, cfg.max_conn)

    if not dataset.ignore_label and cfg.eval_interim:
        # evaluate the intermediate results
        for i in range(cfg.max_conn):
            num = len(dataset.peaks)
            pred_peaks_i = np.arange(num)
            peaks_i = np.arange(num)
            for j in range(num):
                if len(pred_peaks[j]) > i:
                    pred_peaks_i[j] = pred_peaks[j][i]
                if len(dataset.peaks[j]) > i:
                    peaks_i[j] = dataset.peaks[j][i]
            acc = accuracy(pred_peaks_i, peaks_i)
            logger.info('[{}-th conn] accuracy of peak match: {:.4f}'.format(
                i + 1, acc))
            acc = 0.
            for idx, peak in enumerate(pred_peaks_i):
                acc += int(dataset.idx2lb[peak] == dataset.idx2lb[idx])
            acc /= len(pred_peaks_i)
            logger.info(
                '[{}-th conn] accuracy of peak label match: {:.4f}'.format(
                    i + 1, acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau_0)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau_0,
                                      inst_num)

    if cfg.save_output:
        logger.info('save predicted confs to {}'.format(opath_pred_confs))
        mkdir_if_no_exists(opath_pred_confs)
        np.savez_compressed(opath_pred_confs,
                            pred_confs=pred_confs,
                            inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)

        opath_pred_labels = osp.join(
            cfg.work_dir, folder, 'tau_{}_pred_labels.txt'.format(cfg.tau_0))
        logger.info('save predicted labels to {}'.format(opath_pred_labels))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

    if cfg.use_gcn_feat:
        # gcn_feat is saved to disk for GCN-E
        opath_feat = osp.join(oprefix, 'features', '{}.bin'.format(oname))
        if not osp.isfile(opath_feat) or cfg.force:
            mkdir_if_no_exists(opath_feat)
            write_feat(opath_feat, gcn_feat)

        name = rm_suffix(osp.basename(opath_feat))
        prefix = oprefix
        ds = BasicDataset(name=name,
                          prefix=prefix,
                          dim=cfg.model['kwargs']['nhid'],
                          normalize=True)
        ds.info()

        # use top embedding of GCN to rebuild the kNN graph
        with Timer('connect to higher confidence with use_gcn_feat'):
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix,
                              ds.features,
                              cfg.knn_method,
                              cfg.knn,
                              is_rebuild=True)
            dists, nbrs = knns2ordered_nbrs(knns)

            pred_dist2peak, pred_peaks = confidence_to_peaks(
                dists, nbrs, pred_confs, cfg.max_conn)
            pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                          inst_num)

        # save clustering results
        if cfg.save_output:
            oname_meta = '{}_gcn_feat'.format(name)
            opath_pred_labels = osp.join(
                oprefix, oname_meta, 'tau_{}_pred_labels.txt'.format(cfg.tau))
            mkdir_if_no_exists(opath_pred_labels)

            idx2lb = list2dict(pred_labels, ignore_value=-1)
            write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

        # evaluation

        if not dataset.ignore_label:
            print('==> evaluation')
            for metric in cfg.metrics:
                evaluate(dataset.gt_labels, pred_labels, metric)
        import json
        import os
        import shutil

        # copy every test image into the folder of its predicted cluster
        img_labels = json.load(
            open(r'/home/finn/research/data/clustering_data/test_index.json',
                 'r',
                 encoding='utf-8'))
        output = r'/home/finn/research/data/clustering_data/mr_gcn_output'
        for label in set(pred_labels):
            if not os.path.exists(os.path.join(output, f'cluster_{label}')):
                os.mkdir(os.path.join(output, f'cluster_{label}'))
        for image in img_labels:
            shutil.copy2(
                image,
                os.path.join(output,
                             f'cluster_{pred_labels[img_labels[image]]}',
                             os.path.split(image)[-1]))
Example #11
    ds = BasicDataset(name=args.name,
                      prefix=args.prefix,
                      dim=args.dim,
                      normalize=args.no_normalize)
    ds.info()

    with Timer('[{}] build_knns'.format(args.knn_method)):
        if args.num_process is None:
            import multiprocessing as mp
            args.num_process = mp.cpu_count()
        print('use {} CPU for computation'.format(args.num_process))
        knn_prefix = os.path.join(args.prefix, 'knns', args.name)
        knns = build_knns(knn_prefix,
                          ds.features,
                          args.knn_method,
                          args.knn,
                          num_process=args.num_process)

    if args.test_all:
        with Timer('knns2spmat'):
            adj1 = knns2spmat(knns, args.knn, args.th_sim, use_sim=True)

        with Timer('fast_knns2spmat'):
            adj2 = fast_knns2spmat(knns, args.knn, args.th_sim, use_sim=True)

        print('#adj1: {}, #adj2: {}, #non-eq: {}'.format(
            adj1.nnz, adj2.nnz, (adj1 != adj2).nnz))

        assert is_spmat_eq(adj1, adj2), "adj1 and adj2 are not equal"
        print('Output of knns2spmat and fast_knns2spmat are equal')
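`is_spmat_eq` presumably builds on the same sparse inequality used in the print statement above; a one-line reconstruction (hypothetical):

def is_spmat_eq(a, b):
    # Two scipy sparse matrices are equal iff their elementwise
    # inequality has no stored entries.
    return (a != b).nnz == 0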
def chinese_whispers(feats, prefix, name, knn_method, knn, th_sim, iters,
                     **kwargs):
    """ Chinese Whispers Clustering Algorithm

    Paper: Chinese whispers: an efficient graph clustering algorithm
            and its application to natural language processing problems.
    Reference code:
        - http://alexloveless.co.uk/data/chinese-whispers-graph-clustering-in-python/
        - https://github.com/zhly0/facenet-face-cluster-chinese-whispers-
    """
    import networkx as nx

    assert len(feats) > 1

    with Timer('create graph'):
        knn_prefix = os.path.join(prefix, 'knns', name)
        knns = build_knns(knn_prefix, feats, knn_method, knn)
        spmat = fast_knns2spmat(knns, knn, th_sim, use_sim=True)

        size = len(feats)
        nodes = [(n_i, {'cluster': n_i}) for n_i in range(size)]
        c = spmat.tocoo()
        edges = [(n_i, n_j, {
            'weight': s
        }) for n_i, n_j, s in zip(c.row, c.col, c.data)]

        G = nx.Graph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        node_num = G.number_of_nodes()
        edge_num = G.number_of_edges()
        assert size == node_num
        print('#nodes: {}, #edges: {}'.format(node_num, edge_num))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        cluster_nodes = list(G.nodes())
        for _ in range(iters):
            idxs = [i for i in range(node_num)]
            random.shuffle(idxs)
            for idx in idxs:
                node = cluster_nodes[idx]
                nbrs = G[node]
                if len(nbrs) == 0:
                    continue
                cluster2weight = {}
                for nbr in nbrs:
                    assigned_cluster = G.nodes[nbr]['cluster']
                    edge_weight = G[node][nbr]['weight']
                    if assigned_cluster not in cluster2weight:
                        cluster2weight[assigned_cluster] = 0
                    cluster2weight[assigned_cluster] += edge_weight

                # assign the node to the class with the largest accumulated edge weight among its neighbors
                cluster2weight = sorted(cluster2weight.items(),
                                        key=lambda kv: kv[1],
                                        reverse=True)
                G.nodes[node]['cluster'] = cluster2weight[0][0]

    clusters = {}
    for (node, data) in G.nodes.items():
        assigned_cluster = data['cluster']

        if assigned_cluster not in clusters:
            clusters[assigned_cluster] = []
        clusters[assigned_cluster].append(node)

    print('#cluster: {}'.format(len(clusters)))
    labels = clusters2labels(clusters.values())
    labels = list(labels.values())

    return labels
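Both Chinese Whispers variants finish with `clusters2labels`. Judging from its usage (`clusters2labels(clusters.values())` followed by `list(labels.values())`), a plausible reconstruction (hypothetical):

def clusters2labels(clusters):
    # Map every member index to the id of its cluster.
    idx2lb = {}
    for lb, cluster in enumerate(clusters):
        for idx in cluster:
            idx2lb[idx] = lb
    return idx2lb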
Example #13
    def __init__(self, cfg):
        feat_path = cfg['feat_path']
        label_path = cfg.get('label_path', None)
        knn_graph_path = cfg.get('knn_graph_path', None)

        self.k = cfg['k']
        self.feature_dim = cfg['feature_dim']
        self.is_norm_feat = cfg.get('is_norm_feat', True)
        self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

        self.th_sim = cfg.get('th_sim', 0.)
        self.max_conn = cfg.get('max_conn', 1)
        self.conf_metric = cfg.get('conf_metric')
        self.num_process = cfg.get('num_process', 16)

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.inst_num = len(self.idx2lb)
                self.gt_labels = intdict2ndarray(self.idx2lb)
                self.ignore_label = False
            else:
                self.inst_num = -1
                self.ignore_label = True
            self.features = read_probs(feat_path, self.inst_num,
                                       self.feature_dim)
            if self.is_norm_feat:
                self.features = l2norm(self.features)
            if self.inst_num == -1:
                self.inst_num = self.features.shape[0]
            self.size = 1 # take the entire graph as input

        with Timer('read knn graph'):
            if knn_graph_path is not None and os.path.isfile(knn_graph_path):
                knns = np.load(knn_graph_path)['data']    # num_imgs*2*k
            else:
                if knn_graph_path is not None:
                    print('knn_graph_path does not exist: {}'.format(
                        knn_graph_path))
                knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
                # k-NN search via faiss; the author's faiss_gpu implementation
                # may be buggy, but large-scale faiss search on the CPU is
                # still slow. faiss does offer memory and speed optimizations
                # (PQ, IVF, etc.); see the faiss documentation for details.
                knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                                  cfg.knn, self.num_process)
            # build the adjacency matrix from the k-NN search results
            adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

            # build symmetric adjacency matrix
            adj = build_symmetric_adj(adj, self_loop=True)
            adj = row_normalize(adj)
            if self.save_decomposed_adj:
                adj = sparse_mx_to_indices_values(adj)
                self.adj_indices, self.adj_values, self.adj_shape = adj
            else:
                self.adj = adj

            # convert knns to (dists, nbrs)
            self.dists, self.nbrs = knns2ordered_nbrs(knns)  # num_imgs*k

        print('feature shape: {}, k: {}, norm_feat: {}'.format(
            self.features.shape, self.k, self.is_norm_feat))

        if not self.ignore_label:
            with Timer('Prepare ground-truth label'):
                self.labels = confidence(feats=self.features,
                                         dists=self.dists,
                                         nbrs=self.nbrs,
                                         metric=self.conf_metric,
                                         idx2lb=self.idx2lb,
                                         lb2idxs=self.lb2idxs)
                if cfg.eval_interim:
                    _, self.peaks = confidence_to_peaks(
                        self.dists, self.nbrs, self.labels, self.max_conn)
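`row_normalize`, applied to the symmetric adjacency in several of these examples, is the standard degree normalization (D^-1 * A) used to prepare GCN inputs. A minimal sketch assuming SciPy sparse input (hypothetical reconstruction):

import numpy as np
import scipy.sparse as sp

def row_normalize(mx):
    # Scale each row so it sums to 1; rows that sum to zero stay zero.
    rowsum = np.asarray(mx.sum(axis=1)).flatten().astype(np.float64)
    inv = np.divide(1.0, rowsum, out=np.zeros_like(rowsum), where=rowsum != 0)
    return sp.diags(inv) @ mx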
Example #14
def train_gcn(model, cfg, logger):
    # prepare dataset
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.train_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.train_data)
    pre_features = torch.FloatTensor(dataset.features)
    print('Have loaded the training data.')

    inst_num = dataset.inst_num
    feature_dim = dataset.feature_dim
    lb2idxs = dataset.lb2idxs
    center_fea = dataset.center_fea.astype('float32')
    cls_num, dim = center_fea.shape

    labels = torch.LongTensor(dataset.gt_labels)
    HEAD1 = HEAD(nhid=512)
    HEAD_test1 = HEAD_test(nhid=512)

    #load parameters from the pretrained model
    #model.load_state_dict(torch.load('./'))
    #HEAD1.load_state_dict(torch.load('./'), False)

    OPTIMIZER = optim.SGD([{'params': model.parameters(),'weight_decay':1e-5},
                           {'params': HEAD1.parameters(),'weight_decay':1e-5}], lr=0.01, momentum=0.9)
    print('the learning rate is 0.01')

    #model.load_state_dict(torch.load(''))
    #HEAD1.load_state_dict(torch.load(''))
    print("have load the pretrained model.")
    cfg.cuda = True
    model = model.cuda()
    HEAD1 = HEAD1.cuda()

    MODEL_ROOT = './src/train_model'
    print('the model save path is', MODEL_ROOT)

    #prepare the test data
    target = "part1_test"
    knn_path = "./data/knns/" + target + "/faiss_k_80.npz"
    knns = np.load(knn_path, allow_pickle=True)['data']
    inst_num = knns.shape[0]
    k_num = knns.shape[2]
    nbrs = knns[:, 0, :]
    pair_a = []
    pair_b = []
    for i in range(inst_num):
        pair_a.extend([i] * k_num)
        pair_b.extend(nbrs[i])


    for epoch in range(cfg.total_epochs):
        if epoch == cfg.STAGES[0]:  # adjust LR for each training stage after warm-up; you can also adjust LR manually (with slight modification) once a plateau is observed
            schedule_lr(OPTIMIZER)
        if epoch == cfg.STAGES[1]:
            schedule_lr(OPTIMIZER)
        if epoch == cfg.STAGES[2]:
            schedule_lr(OPTIMIZER)

        model.train()
        HEAD1.train()

        index = faiss.IndexFlatIP(dim)
        index.add(center_fea)
        sims, cluster_id = index.search(center_fea, k=(cfg.cluster_num+200))  # retrieve extra candidate centers so each batch can sample cfg.cluster_num of them
        #sims, cluster_id = index.search(center_fea, k=cfg.cluster_num)
        print('Have selected the cluster ids.')

        for batch in range(cls_num):
        #for batch in range(20):
            # 0. select ids
            sample_cluster_id = random.sample(list(cluster_id[batch]), cfg.cluster_num)
            #sample_cluster_id = list(cluster_id[batch])
            sample_id = []  # the indices of the samples in this batch
            for i in range(len(sample_cluster_id)):
                sample_id.extend(random.sample(lb2idxs[sample_cluster_id[i]],
                                               int(len(lb2idxs[sample_cluster_id[i]]) * 0.9)))
                #sample_id.extend(lb2idxs[sample_cluster_id[i]])
            #sample_id.sort()
            sample_num = len(sample_id)
            #id = list(np.arange(0, sample_num, 1))
            #sample2sort = dict(zip(sample_id, id))
            if sample_num > 100000 or sample_num < 100:
                print('[sample num out of range] skip this batch.')
                continue

            #1.create selected labels and images
            batch_labels = labels[sample_id]
            feature = pre_features[sample_id]
            print(sample_num)

            #2.create knn for this batch
            with Timer('build knn:'):
                knn_prefix = os.path.join("./data/rebuild_knn")
                if not os.path.exists(knn_prefix):
                    os.makedirs(knn_prefix)
                if os.path.exists(os.path.join(knn_prefix, 'faiss_k_80.npz')):
                    os.remove(os.path.join(knn_prefix, 'faiss_k_80.npz'))
                if os.path.exists(os.path.join(knn_prefix, 'faiss_k_80.index')):
                    os.remove(os.path.join(knn_prefix, 'faiss_k_80.index'))

                knns = build_knns(knn_prefix,
                                  #l2norm(feature.clone().detach().cpu().numpy()),
                                  l2norm(feature.numpy()),
                                  "faiss",
                                  80,
                                  is_rebuild=True)
                batch_adj = fast_knns2spmat(knns, 80, 0, use_sim=True)
                batch_adj = build_symmetric_adj(batch_adj, self_loop=True)
                batch_adj = row_normalize(batch_adj)
                batch_adj = sparse_mx_to_torch_sparse_tensor(batch_adj, return_idx=False)

            #3.put selected feature and labels to cuda
            batch_labels = batch_labels.cuda()
            feature = feature.cuda()
            batch_adj = batch_adj.cuda()
            train_data = [feature, batch_adj, batch_labels]
            #x = model(train_data)

            #4.train the model
            #add
            train_id_inst = batch_adj._indices().size()[1]
            #print('train_id_inst:', train_id_inst)
            #print('sample_num:', sample_num)
            #train_id_inst = sample_num
            rad_id = random.sample(range(0, train_id_inst), train_id_inst)+random.sample(range(0, train_id_inst), train_id_inst)
            patch_num = 40
            for i in range(patch_num*2):
                id = rad_id[i * int(train_id_inst / patch_num):(i + 1) * int(train_id_inst / patch_num)]
                x = model(train_data)
                loss = HEAD1(x, train_data, id)

                OPTIMIZER.zero_grad()
                loss.backward()
                OPTIMIZER.step()

                print(datetime.datetime.now())
                print('epoch:{}/{}, batch:{}/{}, patch:{}/{}, loss:{}'.format(epoch, cfg.total_epochs, batch, cls_num, i, patch_num*2, loss))

            if (batch+1)%100==0:
                if not os.path.exists(MODEL_ROOT):
                    os.makedirs(MODEL_ROOT)
                print('save model in epoch:{} batch:{} to {}'.format(epoch, batch, MODEL_ROOT))
                torch.save(model.state_dict(), os.path.join(MODEL_ROOT, "Backbone_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
                torch.save(HEAD1.state_dict(), os.path.join(MODEL_ROOT, "Head_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
            
            if (batch + 1) % 300 == 0:
                avg_acc = perform_val(model, HEAD1, HEAD_test1, cfg, feature_dim, pair_a, pair_b)
                print('the avg testing acc in epoch:{} batch:{} is:'.format(epoch, batch), avg_acc)
                model.train()
                HEAD1.train()


        #5.test
        avg_acc = perform_val(model, HEAD1, HEAD_test1, cfg, feature_dim, pair_a, pair_b)
        print('the avg testing acc in epoch:{} batch:{} is:'.format(epoch, batch), avg_acc)


        # 6.save model
        if not os.path.exists(MODEL_ROOT):
            os.makedirs(MODEL_ROOT)
        print('save model in epoch:{} batch:{} to {}'.format(epoch, batch, MODEL_ROOT))
        torch.save(model.state_dict(), os.path.join(MODEL_ROOT, "Backbone_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
        torch.save(HEAD1.state_dict(), os.path.join(MODEL_ROOT, "Head_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
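The per-epoch center search in `train_gcn` uses faiss's exact inner-product index. A standalone sketch of that call pattern (random toy vectors; with L2-normalized rows, inner product equals cosine similarity):

import numpy as np
import faiss

dim = 8
center_fea = np.random.rand(100, dim).astype('float32')
center_fea /= np.linalg.norm(center_fea, axis=1, keepdims=True)

index = faiss.IndexFlatIP(dim)  # exact (flat) inner-product index
index.add(center_fea)           # index the cluster centers
sims, cluster_id = index.search(center_fea, 5)  # 5 most similar centers per query
print(sims.shape, cluster_id.shape)  # -> (100, 5) (100, 5)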
Example #15
    def __init__(self, cfg):
        feat_path = cfg['feat_path']
        label_path = cfg.get('label_path', None)
        knn_graph_path = cfg.get('knn_graph_path', None)

        self.k = cfg['k']
        self.feature_dim = cfg['feature_dim']
        self.is_norm_feat = cfg.get('is_norm_feat', True)
        self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

        self.th_sim = cfg.get('th_sim', 0.)
        self.conf_metric = cfg.get('conf_metric')

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.inst_num = len(self.idx2lb)
                self.cls_num = len(self.lb2idxs)
                self.gt_labels = intdict2ndarray(self.idx2lb)
                self.ignore_label = False
            else:
                self.inst_num = -1
                self.ignore_label = True
            self.features = read_probs(feat_path, self.inst_num,
                                       self.feature_dim)

            if self.is_norm_feat:
                self.features = l2norm(self.features)
            if self.inst_num == -1:
                self.inst_num = self.features.shape[0]
            self.size = 1  # take the entire graph as input

        with Timer('Compute center feature'):
            self.center_fea = np.zeros((self.cls_num, self.features.shape[1]))
            for i in range(self.cls_num):
                self.center_fea[i] = np.mean(self.features[self.lb2idxs[i]], 0)
            self.center_fea = l2norm(self.center_fea)

        with Timer('read knn graph'):
            if knn_graph_path is not None and os.path.isfile(knn_graph_path):
                print("load knns from {}".format(knn_graph_path))
                self.knns = np.load(knn_graph_path)['data']
            else:
                if knn_graph_path is not None:
                    print('knn_graph_path does not exist: {}'.format(
                        knn_graph_path))
                knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
                self.knns = build_knns(knn_prefix, self.features,
                                       cfg.knn_method, cfg.knn)

            adj = fast_knns2spmat(self.knns, self.k, self.th_sim, use_sim=True)

            # build symmetric adjacency matrix
            adj = build_symmetric_adj(adj, self_loop=True)
            #print('adj before norm')
            #print(adj)
            adj = row_normalize(adj)
            if self.save_decomposed_adj:
                adj = sparse_mx_to_indices_values(adj)
                self.adj_indices, self.adj_values, self.adj_shape = adj
            else:
                self.adj = adj

            # convert knns to (dists, nbrs)
            self.dists, self.nbrs = knns2ordered_nbrs(self.knns)

        print('feature shape: {}, k: {}, norm_feat: {}'.format(
            self.features.shape, self.k, self.is_norm_feat))
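Finally, `intdict2ndarray`, used throughout these examples to turn `idx2lb` into `gt_labels`, presumably densifies an index-to-label dict. A plausible reconstruction (hypothetical):

import numpy as np

def intdict2ndarray(d, default_val=-1):
    # Densify {index: label} into a label array.
    arr = np.full(len(d), default_val, dtype=np.int64)
    for k, v in d.items():
        arr[k] = v
    return arr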