def test_wrapped_quantizer_HNSW(self):
    """Search a binary IVF index whose coarse quantizer is a wrapped HNSW index.

    Float k-means centroids are trained on the +/-1 expansion of the binary
    training vectors, stored in an IndexHNSWFlat, and that index is wrapped
    with IndexBinaryFromFloat so IndexBinaryIVF can use it as quantizer.
    The search output is compared with an exhaustive IndexBinaryFlat.
    """
    faiss.omp_set_num_threads(1)

    def bin2float(v):
        # Expand every byte into 8 floats in {-1, +1}, lowest bit first.
        def byte2float(byte):
            return np.array(
                [-1.0 + 2.0 * ((byte & (1 << shift)) != 0)
                 for shift in range(8)])

        expanded = [byte2float(byte) for byte in v]
        return np.hstack(expanded).astype('float32')

    def floatvec2nparray(v):
        # Copy the vector element by element, then fold it into rows of d.
        values = [np.float32(v.at(pos)) for pos in range(v.size())]
        return np.array(values).reshape(-1, d)

    d = 256
    nt = 12800
    nb = 10000
    nq = 500
    xt, xb, xq = make_binary_dataset(d, nb, nt, nq)

    # Exhaustive reference index.
    exact_index = faiss.IndexBinaryFlat(d)
    exact_index.add(xb)

    # Float k-means on the expanded training vectors.
    nlist = 256
    clus = faiss.Clustering(d, nlist)
    clus_index = faiss.IndexFlatL2(d)
    xt_f = np.array([bin2float(row) for row in xt])
    clus.train(xt_f, clus_index)
    centroids = floatvec2nparray(clus.centroids)

    # HNSW index over the centroids, exposed through the binary interface.
    hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
    hnsw_quantizer.add(centroids)
    hnsw_quantizer.is_trained = True
    wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)

    assert nlist == hnsw_quantizer.ntotal
    assert nlist == wrapped_quantizer.ntotal
    assert wrapped_quantizer.is_trained

    index = faiss.IndexBinaryIVF(wrapped_quantizer, d, hnsw_quantizer.ntotal)
    index.nprobe = 128
    assert index.is_trained
    index.add(xb)

    D_ref, I_ref = exact_index.search(xq, 10)
    D, I = index.search(xq, 10)

    # A query counts as a hit when the first reference value (first array
    # returned by search) also appears among the 10 values of the IVF result.
    hits = sum(ref_row[0] in row[:10] for ref_row, row in zip(D_ref, D))
    recall = hits / float(D_ref.shape[0])
    assert recall > 0.77, "recall = %g" % recall
def cluster(self, points, k):
    """Run 10 iterations of faiss k-means on `points` and return the centroids.

    Args:
        points: 2-D array; its second dimension is the vector dimensionality.
        k: number of clusters.

    Returns:
        Array of centroids reshaped to (clus.k, clus.d).
    """
    index = self._faiss_index_flat_l2(points.shape[1])

    clus = faiss.Clustering(points.shape[1], k)
    clus.verbose = False
    clus.niter = 10

    # Training data is passed as a contiguous float32 array.
    training_data = np.ascontiguousarray(points, dtype=np.float32)
    clus.train(training_data, index)

    flat_centroids = faiss.vector_float_to_array(clus.centroids)
    return flat_centroids.reshape(clus.k, clus.d)
def test_redo(self):
    """Check that 10 k-means redos reach a lower final objective than 1 redo.

    The final objective is read from `Clustering.iteration_stats` when the
    installed faiss provides it, and from the older `Clustering.obj` vector
    otherwise, so the test runs against both API generations.
    """
    d = 64
    n = 1000
    rs = np.random.RandomState(123)
    x = rs.uniform(size=(n, d)).astype('float32')

    def final_objective(nredo):
        # Train 20 clusters with the given number of redos and return the
        # objective value of the last recorded iteration.
        clus = faiss.Clustering(d, 20)
        clus.nredo = nredo
        clus.train(x, faiss.IndexFlatL2(d))
        if hasattr(clus, 'iteration_stats'):
            stats = clus.iteration_stats
            return stats.at(stats.size() - 1).obj
        return faiss.vector_to_array(clus.obj)[-1]

    obj1 = final_objective(1)
    obj10 = final_objective(10)

    self.assertGreater(obj1, obj10)
def train_kmeans(x, num_clusters=1000, gpu_ids=None, niter=100, nredo=1, verbose=0):
    """
    Runs k-means clustering on one or several GPUs

    Args:
        x: 2-D training array; must not contain NaN or Inf.
        num_clusters: number of centroids to train.
        gpu_ids: None to train on CPU, a single GPU id, or a non-empty
            sequence of GPU ids.
        niter: number of k-means iterations.
        nredo: number of k-means redos.
        verbose: truthy to enable faiss' verbose output.

    Returns:
        Centroids reshaped to (num_clusters, d).

    Raises:
        AssertionError: if x contains NaN/Inf or gpu_ids is an empty sequence.
    """
    assert np.all(~np.isnan(x)), 'x contains NaN'
    assert np.all(np.isfinite(x)), 'x contains Inf'
    if isinstance(gpu_ids, int):
        gpu_ids = [gpu_ids]
    assert gpu_ids is None or len(gpu_ids)

    d = x.shape[1]
    kmeans = faiss.Clustering(d, num_clusters)
    kmeans.verbose = bool(verbose)
    kmeans.niter = niter
    kmeans.nredo = nredo

    # otherwise the kmeans implementation sub-samples the training set
    kmeans.max_points_per_centroid = 10000000

    if gpu_ids is not None:
        res = [faiss.StandardGpuResources() for _ in gpu_ids]

        flat_config = []
        for device_id in gpu_ids:
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = device_id
            flat_config.append(cfg)

        if len(gpu_ids) == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            # IndexProxy was renamed IndexReplicas in later faiss releases;
            # use whichever name the installed version exposes.
            replicas_cls = getattr(faiss, 'IndexReplicas', None) or faiss.IndexProxy
            index = replicas_cls()
            for resources, cfg in zip(res, flat_config):
                index.addIndex(faiss.GpuIndexFlatL2(resources, d, cfg))
    else:
        index = faiss.IndexFlatL2(d)

    # perform the training
    kmeans.train(x, index)
    centroids = faiss.vector_float_to_array(kmeans.centroids)

    # The per-iteration objective is deliberately not read here: it was
    # never used, and the `obj` attribute is absent from newer faiss.
    return centroids.reshape(num_clusters, d)
def test_redo(self):
    """Check that 10 k-means redos give a lower final objective than 1 redo."""
    dim = 64
    npoints = 1000
    rng = np.random.RandomState(123)
    x = rng.uniform(size=(npoints, dim)).astype('float32')

    def last_objective(nredo):
        # Train 20 clusters on an L2 index and return the objective stored
        # in the last entry of the iteration statistics.
        clus = faiss.Clustering(dim, 20)
        clus.nredo = nredo
        clus.train(x, faiss.IndexFlatL2(dim))
        stats = clus.iteration_stats
        return stats.at(stats.size() - 1).obj

    # make sure that doing 10 redos yields a better objective than just 1
    single_run = last_objective(1)
    ten_runs = last_objective(10)

    self.assertGreater(single_run, ten_runs)
def test_redo_cosine(self):
    """Check the effect of redos when clustering with an inner-product index.

    The data is L2-normalized and assigned with IndexFlatIP, so the
    comparison is reversed with respect to the L2 test: the objective after
    10 redos must be greater than after 1 redo.
    """
    dim = 64
    npoints = 1000
    rng = np.random.RandomState(123)
    x = rng.uniform(size=(npoints, dim)).astype('float32')
    faiss.normalize_L2(x)

    def last_objective(nredo):
        # Train 20 clusters on an inner-product index and return the
        # objective stored in the last entry of the iteration statistics.
        clus = faiss.Clustering(dim, 20)
        clus.nredo = nredo
        clus.train(x, faiss.IndexFlatIP(dim))
        stats = clus.iteration_stats
        return stats.at(stats.size() - 1).obj

    # make sure that doing 10 redos yields a better objective than just 1
    # for cosine distance, it is IP so higher is better
    single_run = last_objective(1)
    ten_runs = last_objective(10)

    self.assertGreater(ten_runs, single_run)
def kmeans(data, nmb_clusters, preprocess=True, verbose=False):
    """Runs kmeans on 1 GPU.

    Args:
        data: input features; passed through `preprocess_features` first
            when `preprocess` is true.
        nmb_clusters (int): number of clusters
        preprocess (bool): whether to preprocess `data` before clustering
        verbose (bool): print the per-iteration losses

    Returns:
        tuple: (images_lists, final loss), where images_lists[c] holds the
        ids of the data points assigned to cluster c.
    """
    x = preprocess_features(data) if preprocess else data
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)
    clus.niter = 20
    clus.max_points_per_centroid = 20000

    # Flat L2 index on GPU 0 with float16 disabled.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, nearest = index.search(x, 1)

    # Per-iteration objective, read from the iteration statistics
    # (see https://github.com/facebookresearch/faiss/issues/1179).
    stats = clus.iteration_stats
    losses = np.array([stats.at(it).obj for it in range(stats.size())])
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    assignments = [int(row[0]) for row in nearest]
    images_lists = [[] for _ in range(nmb_clusters)]
    for sample_id in range(len(data)):
        images_lists[assignments[sample_id]].append(sample_id)

    return images_lists, losses[-1]
def test_wrapped_quantizer_HNSW(self):
    """Search a binary IVF index whose coarse quantizer is a wrapped HNSW index.

    Float k-means centroids are trained on the +/-1 expansion of the binary
    training vectors, stored in an IndexHNSWFlat, and that index is wrapped
    with IndexBinaryFromFloat so IndexBinaryIVF can use it as quantizer.
    The search output is compared with an exhaustive IndexBinaryFlat.
    """

    def bin2float2d(v):
        # Expand each byte of the (n, d) array into 8 floats in {-1, +1},
        # lowest bit first, giving an (n, d * 8) float32 array.
        n, d = v.shape
        bits = (v.reshape(-1, 1) >> np.arange(8)) & 1
        vf = bits.astype("float32")
        vf *= 2
        vf -= 1
        return vf.reshape(n, d * 8)

    d = 256
    nt = 12800
    nb = 10000
    nq = 500
    xt, xb, xq = make_binary_dataset(d, nb, nt, nq)

    # Exhaustive reference index.
    exact_index = faiss.IndexBinaryFlat(d)
    exact_index.add(xb)

    # Float k-means on the expanded training vectors.
    nlist = 256
    clus = faiss.Clustering(d, nlist)
    clus_index = faiss.IndexFlatL2(d)
    xt_f = bin2float2d(xt)
    clus.train(xt_f, clus_index)
    flat_centroids = faiss.vector_to_array(clus.centroids)
    centroids = flat_centroids.reshape(-1, clus.d)

    # HNSW index over the centroids, exposed through the binary interface.
    hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
    hnsw_quantizer.add(centroids)
    hnsw_quantizer.is_trained = True
    wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)

    assert nlist == hnsw_quantizer.ntotal
    assert nlist == wrapped_quantizer.ntotal
    assert wrapped_quantizer.is_trained

    index = faiss.IndexBinaryIVF(wrapped_quantizer, d, hnsw_quantizer.ntotal)
    index.nprobe = 128
    assert index.is_trained
    index.add(xb)

    D_ref, I_ref = exact_index.search(xq, 10)
    D, I = index.search(xq, 10)

    # A query counts as a hit when the first reference value (first array
    # returned by search) also appears among the 10 values of the IVF result.
    hits = sum(ref_row[0] in row[:10] for ref_row, row in zip(D_ref, D))
    recall = hits / float(D_ref.shape[0])
    assert recall >= 0.77, "recall = %g" % recall
def update_pseudo_labels(self, model, dataloader, device):
    """Recompute `self.pseudo_labels` by clustering the model's embeddings.

    Steps: re-initialise the classifier weights, embed every batch of
    `dataloader` with `model`, reduce the embeddings to `self.red_dim`
    dimensions with a faiss PCA, run 20 k-means iterations with
    `self.ncluster` clusters, and store the index of the nearest centroid
    of every embedding in `self.pseudo_labels`.

    Args:
        model: network whose last output is used as the embedding; must
            expose `feature_dim`.
        dataloader: iterable of batches; element 1 of each batch is the
            model input.
        device: torch device the model and the inputs are moved to.
    """
    import faiss
    import time

    # Reset classifier weights.
    torch.cuda.empty_cache()
    self.classifier.weight.data.normal_(0, 1 / model.feature_dim)

    with torch.no_grad():
        _ = model.eval()
        _ = model.to(device)

        embeddings = []
        progress = tqdm(dataloader, 'Getting DC Embeddings...',
                        total=len(dataloader))
        for input_tuple in progress:
            batch = input_tuple[1].type(torch.FloatTensor).to(device)
            embeddings.append(model(batch)[-1])
        embeddings = torch.cat(embeddings, dim=0).cpu().numpy()

    # Perform PCA.
    print('Computing PCA... ', end='')
    start = time.time()
    pca_mat = faiss.PCAMatrix(embeddings.shape[-1], self.red_dim)
    pca_mat.train(embeddings)
    embeddings = pca_mat.apply_py(embeddings)
    print('Done in {}s.'.format(time.time() - start))

    print('Computing Pseudolabels... ', end='')
    start = time.time()
    reduced_dim = embeddings.shape[-1]
    cpu_cluster_index = faiss.IndexFlatL2(reduced_dim)
    kmeans = faiss.Clustering(reduced_dim, self.ncluster)
    kmeans.niter = 20
    kmeans.min_points_per_centroid = 1
    kmeans.max_points_per_centroid = 1000000000

    # Train k-means and recover the centroids as (ncluster, dim).
    kmeans.train(embeddings, cpu_cluster_index)
    flat_centroids = faiss.vector_float_to_array(kmeans.centroids)
    centroids = flat_centroids.reshape(self.ncluster, reduced_dim)

    # Assign every embedding to its nearest centroid.
    faiss_search_index = faiss.IndexFlatL2(centroids.shape[-1])
    faiss_search_index.add(centroids)
    _, computed_cluster_labels = faiss_search_index.search(embeddings, 1)
    print('Done in {}s.'.format(time.time() - start))

    self.pseudo_labels = computed_cluster_labels

    torch.cuda.empty_cache()
def run_kmeans_multi_gpu(x, nmb_clusters, verbose=False,
                         seed=DEFAULT_KMEANS_SEED, gpu_device=0):
    """
    Runs kmeans on multi GPUs.

    Args:
    -----
    x: data, 2-D array of shape (n_data, d)
    nmb_clusters (int): number of clusters
    verbose (bool): print the per-iteration losses
    seed: seed assigned to the faiss clustering object
    gpu_device: sequence of at least two GPU ids. A bare int (including
        the default) is treated as a one-element list and is therefore
        rejected by the assertion below instead of failing in `len()`.

    Returns:
    --------
    tuple: (cluster id of every data point, final k-means loss)

    Raises:
    -------
    AssertionError: if fewer than two GPU ids are given.
    """
    import numpy as np

    n_data, d = x.shape

    if isinstance(gpu_device, int):
        gpu_device = [gpu_device]
    ngpus = len(gpu_device)
    assert ngpus > 1, \
        'run_kmeans_multi_gpu needs at least two GPU ids, got %r' % (gpu_device,)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    clus.seed = seed

    res = [faiss.StandardGpuResources() for _ in range(ngpus)]

    flat_config = []
    for device_id in gpu_device:
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = device_id
        flat_config.append(cfg)

    # One flat L2 index per GPU, grouped in an IndexReplicas.
    indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
               for i in range(ngpus)]
    index = faiss.IndexReplicas()
    for sub_index in indexes:
        index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(clus, 'iteration_stats'):
        stats = clus.iteration_stats
        losses = np.array([stats.at(i).obj for i in range(stats.size())],
                          dtype='float32')
    else:
        losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
def run_kmeans(x, nmb_clusters, verbose=False, seed=DEFAULT_KMEANS_SEED, gpu_device=0):
    """
    Runs kmeans on 1 GPU.

    Args:
    -----
    x: data, 2-D array of shape (n_data, d)
    nmb_clusters (int): number of clusters
    verbose (bool): print the per-iteration losses
    seed: seed assigned to the faiss clustering object
    gpu_device (int): id of the GPU holding the flat index

    Returns:
    --------
    tuple: (cluster id of every data point, final k-means loss)
    """
    import numpy as np

    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    clus.seed = seed

    # Flat L2 index on the requested GPU with float16 disabled.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = gpu_device
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(clus, 'iteration_stats'):
        stats = clus.iteration_stats
        losses = np.array([stats.at(i).obj for i in range(stats.size())],
                          dtype='float32')
    else:
        losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
def run_kmeans(self, x, nmb_clusters):
    """Run 20 iterations of faiss k-means on GPU 0 and keep the index.

    The trained flat index is stored in `self.index`.

    Args:
        x: data, 2-D array whose second dimension is the dimensionality.
        nmb_clusters (int): number of clusters.

    Returns:
        tuple: (flattened array with the cluster id of every data point,
        final k-means loss).
    """
    import numpy as np

    _, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    # clus.max_points_per_centroid = 10000000

    # NOTE(review): `res` is local while the index outlives this call;
    # confirm the installed faiss wrapper keeps the GPU resources alive.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    self.index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, self.index)
    _, labels = self.index.search(x, 1)

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(clus, 'iteration_stats'):
        stats = clus.iteration_stats
        losses = np.array([stats.at(i).obj for i in range(stats.size())],
                          dtype='float32')
    else:
        losses = faiss.vector_to_array(clus.obj)
    if self.verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return labels.ravel(), losses[-1]
def run_kmeans(x, nmb_clusters, verbose=True):
    """Runs kmeans on 1 GPU.

    Besides clustering, prints the input shape and the type, length and
    shape of the centroid array before and after reshaping.

    Args:
        x: data, 2-D array of shape (n_data, d)
        nmb_clusters (int): number of clusters
        verbose (bool): print the per-iteration losses

    Returns:
        tuple: (cluster id of every data point, final k-means loss)
    """
    import numpy as np

    n_data, d = x.shape
    print("x.shape:", x.shape)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    # Flat L2 index on GPU 0 with float16 disabled.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(clus, 'iteration_stats'):
        stats = clus.iteration_stats
        losses = np.array([stats.at(i).obj for i in range(stats.size())],
                          dtype='float32')
    else:
        losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    # Debug output describing the flat centroid vector ...
    centroids = faiss.vector_float_to_array(clus.centroids)
    print("centroids:")
    print("type:", type(centroids))
    print("len:", len(centroids))
    print("shape:", centroids.shape)

    # ... and its (nmb_clusters, d) view.
    centroids_rs = centroids.reshape(nmb_clusters, d)
    print("centroids_reshape:")
    print("type:", type(centroids_rs))
    print("len:", len(centroids_rs))
    print("shape:", centroids_rs.shape)

    return [int(n[0]) for n in I], losses[-1]
def run_kmeans(x, nmb_clusters):
    """Cluster a tensor with faiss k-means and return the assignments.

    Args:
        x: tensor of shape (n_data, d); converted to a float32 numpy array
            with `c_f.to_numpy` before clustering.
        nmb_clusters (int): number of clusters

    Returns:
        Integer tensor on the original device of `x` holding the cluster id
        of every row.
    """
    origin_device = x.device
    features = c_f.to_numpy(x).astype(np.float32)
    n_data, d = features.shape
    logging.info("running k-means clustering with k=%d" % nmb_clusters)
    logging.info("embedding dimensionality is %d" % d)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    # Flat L2 index, moved to all GPUs when faiss reports at least one.
    index = faiss.IndexFlatL2(d)
    if faiss.get_num_gpus() > 0:
        index = faiss.index_cpu_to_all_gpus(index)

    # perform the training
    clus.train(features, index)
    _, nearest = index.search(features, 1)

    assignments = [int(row[0]) for row in nearest]
    return torch.tensor(assignments, dtype=int, device=origin_device)
def train_coarse_quantizer(x, k, preproc):
    """Train k centroids on the preprocessed vectors using multiple GPUs.

    Args:
        x: training vectors; sanitized and transformed with
            `preproc.apply_py` before clustering.
        k: number of centroids.
        preproc: transform exposing `d_out` (the clustering dimensionality)
            and `apply_py`.

    Returns:
        Centroids reshaped to (k, d) with d = preproc.d_out.
    """
    d = preproc.d_out
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    # clus.niter = 2
    clus.max_points_per_centroid = 10000000

    # Each message is a single pre-formatted string passed to print(), which
    # produces the same text under Python 2 and Python 3.
    print("apply preproc on shape %s k= %s" % (x.shape, k))
    t0 = time.time()
    x = preproc.apply_py(sanitize(x))
    print("   preproc %.3f s output shape %s" % (time.time() - t0, x.shape))

    vres, vdev = make_vres_vdev()
    index = faiss.index_cpu_to_gpu_multiple(vres, vdev, faiss.IndexFlatL2(d))

    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    return centroids.reshape(k, d)
def perform_clustering(step, features, accelerator):
    """Cluster `features` with a cluster count chosen from the training step.

    For steps in [0, 50000) the number of clusters is picked from a fixed
    schedule indexed by `step // 10000`. The local main process trains
    k-means on its GPU and assigns every row to its nearest centroid; the
    other processes build a placeholder list. The list is then passed to
    `torch.distributed.broadcast_object_list` with src=0.

    Args:
        step: current training step.
        features: 2-D array of shape (num_inst, d).
        accelerator: object exposing `is_local_main_process`,
            `local_process_index` and `device`.

    Returns:
        LongTensor of cluster ids on `accelerator.device`, or None when
        `step` is outside [0, 50000).
    """
    # num_cluster = [10000, 20000, 40000, 80000, 100000, 100000]
    num_cluster = [2500, 5000, 10000, 20000, 40000, 80000]

    if not (step >= 0 and step < 50000):
        return None

    d = features.shape[1]
    k = num_cluster[step // 10000]

    clus = faiss.Clustering(d, k)
    clus.verbose = False
    clus.niter = 20
    clus.nredo = 5
    clus.seed = 0
    clus.max_points_per_centroid = 1000
    clus.min_points_per_centroid = 1

    if accelerator.is_local_main_process:
        clus.verbose = True

        # Flat L2 index on this process' GPU with float16 disabled.
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = accelerator.local_process_index
        index = faiss.GpuIndexFlatL2(res, d, flat_config)

        features = features.astype('float32')
        clus.train(features, index)

        # Nearest-centroid search in batches of 16 rows.
        num_inst = features.shape[0]
        bsz = 16
        assignment_chunks = []
        for start in range(0, num_inst, bsz):
            _, chunk_ids = index.search(features[start:start + bsz], 1)
            assignment_chunks.append(chunk_ids)
        idxs = np.concatenate(assignment_chunks)
        cluster_result = [int(row[0]) for row in idxs]
    else:
        cluster_result = [None] * features.shape[0]

    torch.distributed.broadcast_object_list(cluster_result, src=0, group=None)
    return torch.LongTensor(cluster_result).to(accelerator.device)
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.

    Args:
        x: data, 2-D array of shape (n_data, d)
        nmb_clusters (int): number of clusters
        verbose (bool): print the per-iteration losses

    Returns:
        tuple: (cluster id of every data point, final k-means loss)
    """
    print("run kmeans begin")
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    # Flat L2 index on GPU 0 with float16 disabled.
    gpu_resources = faiss.StandardGpuResources()
    gpu_config = faiss.GpuIndexFlatConfig()
    gpu_config.useFloat16 = False
    gpu_config.device = 0
    index = faiss.GpuIndexFlatL2(gpu_resources, d, gpu_config)

    # perform the training
    clus.train(x, index)
    _, nearest = index.search(x, 1)

    # Collect the objective of every recorded iteration.
    objectives = []
    for it in range(clus.iteration_stats.size()):
        objectives.append(clus.iteration_stats.at(it).obj)
    losses = np.array(objectives)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    assignments = [int(row[0]) for row in nearest]
    return assignments, losses[-1]
def train_kmeans(x, num_clusters=1000, num_gpus=1):
    """
    Runs k-means clustering on one or several GPUs

    Args:
        x: 2-D training array of shape (n, d).
        num_clusters: number of centroids to train.
        num_gpus: number of GPUs to use (devices 0 .. num_gpus - 1);
            0 trains on a CPU flat index.

    Returns:
        Centroids reshaped to (num_clusters, d).
    """
    import numpy as np

    d = x.shape[1]
    kmeans = faiss.Clustering(d, num_clusters)
    kmeans.verbose = True
    kmeans.niter = 20

    # otherwise the kmeans implementation sub-samples the training set
    kmeans.max_points_per_centroid = 10000000

    if num_gpus == 0:
        index = faiss.IndexFlatL2(d)
    else:
        res = [faiss.StandardGpuResources() for _ in range(num_gpus)]

        flat_config = []
        for device_id in range(num_gpus):
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = device_id
            flat_config.append(cfg)

        if num_gpus == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            # IndexProxy was renamed IndexReplicas in later faiss releases;
            # use whichever name the installed version exposes.
            replicas_cls = getattr(faiss, 'IndexReplicas', None) or faiss.IndexProxy
            index = replicas_cls()
            for resources, cfg in zip(res, flat_config):
                index.addIndex(faiss.GpuIndexFlatL2(resources, d, cfg))

    # perform the training
    kmeans.train(x, index)
    # Each message is a single pre-formatted string passed to print(), which
    # produces the same text under Python 2 and Python 3.
    print('Total number of indexed vectors (after kmeans.train()): %s'
          % (index.ntotal,))
    centroids = faiss.vector_float_to_array(kmeans.centroids)

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(kmeans, 'iteration_stats'):
        stats = kmeans.iteration_stats
        objective = np.array([stats.at(i).obj for i in range(stats.size())],
                             dtype='float32')
    else:
        objective = faiss.vector_float_to_array(kmeans.obj)
    print('Objective values per iter: %s' % (objective,))
    print("Final objective: %.4g" % objective[-1])

    # TODO: return cluster assignment

    return centroids.reshape(num_clusters, d)
def train_kmeans(x, k, ngpu, max_points_per_centroid=256):
    """Run 20 iterations of k-means on the CPU or on one or several GPUs.

    Args:
        x: 2-D training array of shape (n, d).
        k: number of centroids.
        ngpu: 0 for a CPU flat index, 1 for a single GPU index, more for
            one GPU index per device grouped in an IndexReplicas.
        max_points_per_centroid: value assigned to the clustering object's
            field of the same name.

    Returns:
        Centroids reshaped to (k, d).
    """
    d = x.shape[1]

    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20
    clus.max_points_per_centroid = max_points_per_centroid

    def make_config(device_id):
        # Flat GPU index configuration with float16 disabled.
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = device_id
        return cfg

    if ngpu == 0:
        index = faiss.IndexFlatL2(d)
    else:
        res = [faiss.StandardGpuResources() for _ in range(ngpu)]
        flat_config = [make_config(device_id) for device_id in range(ngpu)]

        if ngpu == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            indexes = [faiss.GpuIndexFlatL2(resources, d, cfg)
                       for resources, cfg in zip(res, flat_config)]
            index = faiss.IndexReplicas()
            for sub_index in indexes:
                index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    # Objective value of every recorded iteration.
    iteration_stats = clus.iteration_stats
    obj = np.array([iteration_stats.at(it).obj
                    for it in range(iteration_stats.size())])
    print("final objective: %.4g" % obj[-1])

    return centroids.reshape(k, d)
def faiss_kmeans(train_feats, val_feats, nmb_clusters):
    """Train k-means on the training features and assign both splits.

    Args:
        train_feats: tensor of training features; converted with `.numpy()`.
        val_feats: tensor of validation features; converted with `.numpy()`.
        nmb_clusters: number of clusters.

    Returns:
        tuple: (list of cluster ids of the training rows, list of cluster
        ids of the validation rows).
    """
    train_np = train_feats.numpy()
    val_np = val_feats.numpy()
    d = train_np.shape[-1]

    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    # Flat L2 index cloned to all GPUs, with float16 and sharding enabled.
    cloner_options = faiss.GpuMultipleClonerOptions()
    cloner_options.useFloat16 = True
    cloner_options.shard = True
    index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d), cloner_options)

    clus.train(train_np, index)

    _, train_a = index.search(train_np, 1)
    _, val_a = index.search(val_np, 1)

    train_ids = list(train_a[:, 0])
    val_ids = list(val_a[:, 0])
    return train_ids, val_ids
def run_kmeans(x, nmb_clusters):
    """Run 20 iterations of faiss k-means on all GPUs.

    Args:
        x: data, 2-D array of shape (n_data, d)
        nmb_clusters (int): number of clusters

    Returns:
        list: cluster id of every data point
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    # Flat L2 index cloned to all GPUs.
    index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))

    # perform the training
    clus.train(x, index)
    _, nearest = index.search(x, 1)

    assignments = []
    for row in nearest:
        assignments.append(int(row[0]))
    return assignments
def kmeans(data, k, nrestarts=10, niters=100):
    """
    Run k-means on the input data.

    Args:
        data: tensor of shape (n, d); moved to the CPU and converted to a
            contiguous float32 array.
        k: number of centroids.
        nrestarts: value assigned to the clustering object's `nredo`.
        niters: value assigned to the clustering object's `niter`.

    Returns:
        Tensor of shape (k, d) with the centroids, on `defaults.device`.
    """
    points = np.ascontiguousarray(data.cpu().numpy()).astype('float32')
    dim = points.shape[1]

    clus = faiss.Clustering(dim, k)
    clus.verbose = False
    clus.niter = niters
    clus.nredo = nrestarts
    clus.seed = defaults.seed
    clus.spherical = False

    clus.train(points, faiss.IndexFlatL2(dim))

    flat_centroids = faiss.vector_float_to_array(clus.centroids)
    centroid_matrix = flat_centroids.reshape(k, dim)
    return torch.Tensor(centroid_matrix).to(defaults.device)
def spherical_kmeans(data, k, nrestarts=10, niters=100):
    """
    Run spherical k-means on the input data.

    The rows are scaled to unit L2 norm, clustered with an inner-product
    index and `spherical` enabled, and the returned centroids are scaled to
    unit L2 norm as well.

    Args:
        data: tensor of shape (n, d); moved to the CPU and converted to a
            contiguous float32 array.
        k: number of centroids.
        nrestarts: value assigned to the clustering object's `nredo`.
        niters: value assigned to the clustering object's `niter`.

    Returns:
        Tensor of shape (k, d) with the normalized centroids, on
        `defaults.device`.
    """
    points = np.ascontiguousarray(data.cpu().numpy()).astype('float32')
    row_norms = np.linalg.norm(points, axis=1)[:, np.newaxis]
    points /= row_norms
    dim = points.shape[1]

    clus = faiss.Clustering(dim, k)
    clus.verbose = False
    clus.niter = niters
    clus.nredo = nrestarts
    clus.seed = defaults.seed
    clus.spherical = True

    clus.train(points, faiss.IndexFlatIP(dim))

    flat_centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = torch.Tensor(flat_centroids.reshape(k, dim)).to(defaults.device)

    centroid_norms = torch.norm(centroids, 2, 1).unsqueeze(1)
    return centroids / centroid_norms
def train_kmeans(x, k, ngpu):
    """Runs kmeans on one or several GPUs

    Args:
        x: 2-D training array of shape (n, d).
        k: number of centroids.
        ngpu: number of GPUs to use (devices 0 .. ngpu - 1); 0 trains on a
            CPU flat index.

    Returns:
        Centroids reshaped to (k, d).
    """
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20

    # otherwise the kmeans implementation sub-samples the training set
    clus.max_points_per_centroid = 10000000

    if ngpu == 0:
        index = faiss.IndexFlatL2(d)
    else:
        res = [faiss.StandardGpuResources() for _ in range(ngpu)]

        flat_config = []
        for device_id in range(ngpu):
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = device_id
            flat_config.append(cfg)

        if ngpu == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            # IndexProxy was renamed IndexReplicas in later faiss releases;
            # use whichever name the installed version exposes.
            replicas_cls = getattr(faiss, 'IndexReplicas', None) or faiss.IndexProxy
            index = replicas_cls()
            for resources, cfg in zip(res, flat_config):
                index.addIndex(faiss.GpuIndexFlatL2(resources, d, cfg))

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(clus, 'iteration_stats'):
        stats = clus.iteration_stats
        final_obj = stats.at(stats.size() - 1).obj
    else:
        final_obj = faiss.vector_float_to_array(clus.obj)[-1]
    # Single pre-formatted string: same output under Python 2 and 3.
    print("final objective: %.4g" % final_obj)

    return centroids.reshape(k, d)
def cluster(features, num_cluster):
    """Cluster `features` with faiss k-means on GPU 0.

    Args:
        features: 2-D array of shape (n_samples, dim).
        num_cluster: number of clusters.

    Returns:
        tuple: (image_list, final loss), where image_list[c] holds the row
        indices of `features` assigned to cluster c.
    """
    n_samples, dim = features.shape

    kmeans_clustering = faiss.Clustering(dim, num_cluster)
    # The iteration count field is `niter`; the former `n_iter` spelling
    # did not set it, so the requested 20 iterations never took effect.
    kmeans_clustering.niter = 20
    kmeans_clustering.max_points_per_centroid = 1000000000

    # Flat L2 index on GPU 0 with float16 disabled.
    gpu_resource = faiss.StandardGpuResources()
    gpu_flat = faiss.GpuIndexFlatConfig()
    gpu_flat.useFloat16 = False
    gpu_flat.device = 0
    gpu_distance_measure = faiss.GpuIndexFlatL2(gpu_resource, dim, gpu_flat)

    kmeans_clustering.train(features, gpu_distance_measure)
    _, cluster_idxs = gpu_distance_measure.search(features, 1)

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(kmeans_clustering, 'iteration_stats'):
        stats = kmeans_clustering.iteration_stats
        final_loss = stats.at(stats.size() - 1).obj
    else:
        final_loss = faiss.vector_to_array(kmeans_clustering.obj)[-1]

    # Group the row indices by assigned cluster.
    image_list = [[] for _ in range(num_cluster)]
    for row_id in range(len(features)):
        image_list[cluster_idxs[row_id][0]].append(row_id)

    return image_list, final_loss
def nmi(self, embeddings, labels):
    """Compute the NMI between k-means cluster assignments and the labels.

    Args:
        embeddings: 2-D array, or a list of arrays concatenated along axis 0.
        labels: array of labels; when `embeddings` is a list, a list of
            arrays concatenated along axis 0.

    Returns:
        list: one-element list holding the normalized mutual information
        score.
    """
    if isinstance(embeddings, list):
        embeddings = np.concatenate(embeddings, axis=0)
        labels = np.concatenate(labels, axis=0)

    dim = embeddings.shape[-1]
    faiss_search_index = faiss.IndexFlatL2(dim)
    faiss_cluster_index = faiss.Clustering(dim, self.num_classes)
    # The iteration count field is `niter`; the former `n_iter` spelling
    # did not set it, so the requested 20 iterations never took effect.
    faiss_cluster_index.niter = 20
    faiss_cluster_index.min_points_per_centroid = 1
    faiss_cluster_index.max_points_per_centroid = 1000000000

    # Train k-means and recover the centroids as (num_classes, dim).
    faiss_cluster_index.train(embeddings, faiss_search_index)
    embedding_centroids = faiss.vector_float_to_array(
        faiss_cluster_index.centroids).reshape(self.num_classes, dim)

    # Assign every embedding to its nearest centroid with a fresh index.
    faiss_search_index = faiss.IndexFlatL2(embedding_centroids.shape[-1])
    faiss_search_index.add(embedding_centroids)
    _, centroids_cluster_labels = faiss_search_index.search(embeddings, 1)

    NMI = metrics.cluster.normalized_mutual_info_score(
        centroids_cluster_labels.reshape(-1), labels.reshape(-1))

    return [NMI]
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU and plots a 2-D t-SNE embedding of the centroids.

    Args:
        x: data, 2-D array of shape (n_data, d)
        nmb_clusters (int): number of clusters
        verbose (bool): print the per-iteration losses

    Returns:
        tuple: (cluster id of every data point, final k-means loss, the
        `plt` module holding the scatter plot, mean pairwise centroid
        distance as computed by `pdist`)
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 10
    clus.max_points_per_centroid = 10000000
    clus.verbose = True

    # Flat L2 index on GPU 0 with float16 disabled.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)

    # Reshape with the actual dimensionality; the former hard-coded 64
    # produced wrong rows (or failed) whenever d != 64.
    centroids = np.array(faiss.vector_to_array(clus.centroids))
    centroids = np.reshape(centroids, (-1, d))
    centd = pdist(centroids)

    # Scatter plot of the centroids embedded in 2-D, on fixed axes.
    centroids2d = TSNE(n_components=2).fit_transform(centroids)
    plt.scatter(centroids2d[:, 0], centroids2d[:, 1])
    axes = plt.gca()
    axes.set_xlim([-50, 50])
    axes.set_ylim([-50, 50])

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(clus, 'iteration_stats'):
        stats = clus.iteration_stats
        losses = np.array([stats.at(i).obj for i in range(stats.size())],
                          dtype='float32')
    else:
        losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1], plt, np.mean(centd)
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.

    Args:
        x: data, 2-D array of shape (n_data, d)
        nmb_clusters (int): number of clusters
        verbose (bool): print the per-iteration losses

    Returns:
        tuple: (cluster id of every data point, distance of every data
        point to its assigned centroid as returned by the index search,
        final k-means loss, the faiss clustering object, the GPU index,
        the GPU index configuration)
    """
    import numpy as np

    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.nredo = 10
    # Cap at three times the average cluster size (previously 10000000).
    clus.max_points_per_centroid = int(3 * (n_data / nmb_clusters))

    # NOTE(review): `res` is local while the index is returned to the
    # caller; confirm the installed faiss wrapper keeps it alive.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    D, I = index.search(x, 1)

    # Newer faiss exposes the objective through `iteration_stats`;
    # older releases expose it as the `obj` vector.
    if hasattr(clus, 'iteration_stats'):
        stats = clus.iteration_stats
        losses = np.array([stats.at(i).obj for i in range(stats.size())],
                          dtype='float32')
    else:
        losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    assignments = [int(n[0]) for n in I]
    distances = [float(n[0]) for n in D]
    return assignments, distances, losses[-1], clus, index, flat_config
def run_kmeans(x, nmb_clusters):
    """Run 20 iterations of faiss k-means on GPU 0 with a random seed.

    Args:
        x: data, 2-D array of shape (n_data, d)
        nmb_clusters (int): number of clusters

    Returns:
        list: cluster id of every data point
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    # Flat L2 index on GPU 0 with float16 disabled.
    gpu_resources = faiss.StandardGpuResources()
    gpu_config = faiss.GpuIndexFlatConfig()
    gpu_config.useFloat16 = False
    gpu_config.device = 0
    index = faiss.GpuIndexFlatL2(gpu_resources, d, gpu_config)

    # perform the training
    clus.train(x, index)
    _, nearest = index.search(x, 1)

    assignments = []
    for row in nearest:
        assignments.append(int(row[0]))
    return assignments
def run_kmeans(x, nmb_clusters):
    """Run 20 iterations of faiss k-means, on GPUs when available.

    Args:
        x: data, 2-D array of shape (n_data, d)
        nmb_clusters (int): number of clusters

    Returns:
        list: cluster id of every data point
    """
    n_data, d = x.shape
    logging.info("running k-means clustering with k=%d" % nmb_clusters)
    logging.info("embedding dimensionality is %d" % d)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    # Flat L2 index, moved to all GPUs when faiss reports at least one.
    index = faiss.IndexFlatL2(d)
    if faiss.get_num_gpus() > 0:
        index = faiss.index_cpu_to_all_gpus(index)

    # perform the training
    clus.train(x, index)
    _, nearest = index.search(x, 1)

    assignments = []
    for row in nearest:
        assignments.append(int(row[0]))
    return assignments