def train_preprocessor():
    print("train preproc", preproc_str)
    d = xt.shape[1]
    t0 = time.time()
    if preproc_str.startswith('OPQ'):
        fi = preproc_str[3:-1].split('_')
        m = int(fi[0])
        dout = int(fi[1]) if len(fi) == 2 else d
        preproc = faiss.OPQMatrix(d, m, dout)
    elif preproc_str.startswith('PCAR'):
        dout = int(preproc_str[4:-1])
        preproc = faiss.PCAMatrix(d, dout, 0, True)
    else:
        assert False
    preproc.train(sanitize(xt[:1000000]))
    print("preproc train done in %.3f s" % (time.time() - t0))
    return preproc
def __init__(self, d):
    d2 = 256
    nlist = 100  # number of coarse centroids
    m = 8        # number of sub-quantizers
    coarse_quantizer = faiss.IndexFlatL2(d2)
    sub_index = faiss.IndexIVFPQ(coarse_quantizer, d2, nlist, 16, 8)
    pca_matrix = faiss.PCAMatrix(d, d2, 0, True)
    self.index2 = faiss.IndexPreTransform(pca_matrix, sub_index)
    sub_index.own_fields = True
    coarse_quantizer.this.disown()
    self.sub_index = sub_index
    self.pca_matrix = pca_matrix
    self.index2.nprobe = 10
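# A minimal standalone sketch (not from the snippet above) of the same
# IndexPreTransform(PCAMatrix, IndexIVFPQ) pipeline, using random data and
# hypothetical sizes (d=512 input dims, 20000 database vectors) to show the
# train/add/search sequence such an index expects.
import faiss
import numpy as np

d, d2, nlist = 512, 256, 100
xb = np.random.random((20000, d)).astype('float32')   # hypothetical database vectors
xq = np.random.random((5, d)).astype('float32')       # hypothetical query vectors

coarse = faiss.IndexFlatL2(d2)
ivfpq = faiss.IndexIVFPQ(coarse, d2, nlist, 16, 8)
pca = faiss.PCAMatrix(d, d2, 0, True)
index = faiss.IndexPreTransform(pca, ivfpq)

index.train(xb)           # trains the PCA, the coarse quantizer and the PQ codebooks
index.add(xb)             # vectors are PCA-reduced to 256 dims before being encoded
ivfpq.nprobe = 10         # number of inverted lists visited at search time
D, I = index.search(xq, 10)   # distances and ids of the 10 nearest neighbors
print(I.shape)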
def test_pca(self):
    d = 64
    n = 1000
    np.random.seed(123)
    x = np.random.random(size=(n, d)).astype('float32')

    pca = faiss.PCAMatrix(d, 10)
    pca.train(x)
    y = pca.apply_py(x)

    # check that energy per component is decreasing
    column_norm2 = (y ** 2).sum(0)

    prev = 1e50
    for o in column_norm2:
        self.assertGreater(prev, o)
        prev = o
def test_progressive_dim(self):
    d = 32
    n = 10000
    k = 50
    xt, _, _ = get_dataset_2(d, n, 0, 0)

    # basic kmeans
    kmeans = faiss.Kmeans(d, k, gpu=True)
    kmeans.train(xt)

    pca = faiss.PCAMatrix(d, d)
    pca.train(xt)
    xt_pca = pca.apply(xt)

    # same test w/ Kmeans wrapper
    kmeans2 = faiss.Kmeans(d, k, progressive_dim_steps=5, gpu=True)
    kmeans2.train(xt_pca)

    self.assertLess(kmeans2.obj[-1], kmeans.obj[-1])
def preprocess_features(npdata, pca):
    """Preprocess an array of features.

    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    nan_location = np.isnan(npdata)
    inf_location = np.isinf(npdata)
    if (not np.allclose(nan_location, 0)) or (not np.allclose(inf_location, 0)):
        print('before_Astype_Feature NaN or Inf found. Nan count: ',
              np.sum(nan_location), ' Inf count: ', np.sum(inf_location))
        print('###################### break ##################################')
        return npdata

    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    nan_location = np.isnan(npdata)
    inf_location = np.isinf(npdata)
    if (not np.allclose(nan_location, 0)) or (not np.allclose(inf_location, 0)):
        print('after_Astype_Feature NaN or Inf found. Nan count: ',
              np.sum(nan_location), ' Inf count: ', np.sum(inf_location))
        print('###################### break ##################################')
        return npdata

    # Apply PCA-whitening with Faiss
    mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained
    npdata = mat.apply_py(npdata)

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]

    return npdata
def kmeans_cluster(x, n_clusters, verbose=False, pcaDim=-1):
    """K-means clustering using the faiss library."""
    assert x.dim() == 2
    n, d = x.size()
    X = x.numpy()

    if pcaDim > 0:
        if verbose:
            print(f'Applying PCA from {d} dimensions to {pcaDim}')
        pca = faiss.PCAMatrix(d, pcaDim)
        pca.train(X)
        assert pca.is_trained
        Xpca = pca.apply_py(X)
    else:
        Xpca = X

    if verbose:
        print('Clustering 2-dim tensor of size {}'.format(X.shape))

    kmeans = faiss.Kmeans(Xpca.shape[1], n_clusters, niter=20, verbose=verbose)
    kmeans.train(Xpca)
    D, I = kmeans.index.search(Xpca, 1)
    return torch.LongTensor(I).squeeze()
def preprocess_features(npdata, pca=64):
    """Preprocess an array of features.

    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    # Apply PCA-whitening with Faiss
    mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained
    npdata = mat.apply_py(npdata)

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]

    return npdata
def main():
    parser = get_parser()
    args = parser.parse_args()

    print("Reading features")
    x = np.load(args.data, mmap_mode="r")

    print("Computing PCA")
    pca = faiss.PCAMatrix(x.shape[-1], args.dim, args.eigen_power)
    pca.train(x)
    b = faiss.vector_to_array(pca.b)
    A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)

    os.makedirs(args.output, exist_ok=True)

    prefix = str(args.dim)
    if args.eigen_power != 0:
        prefix += f"_{args.eigen_power}"

    np.save(osp.join(args.output, f"{prefix}_pca_A"), A.T)
    np.save(osp.join(args.output, f"{prefix}_pca_b"), b)
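# Follow-up sketch (not part of the script above): applying the exported
# transform with plain NumPy. faiss.PCAMatrix computes y = A x + b with A of
# shape (d_out, d_in); since the script saves A.T, the reduction is x @ A_T + b.
# The file names below assume a hypothetical run with --dim 64, --eigen_power 0
# and --output output/.
import numpy as np

A_T = np.load("output/64_pca_A.npy")   # shape (d_in, 64)
b = np.load("output/64_pca_b.npy")     # shape (64,)
x = np.load("features.npy")            # hypothetical feature file, shape (N, d_in)
x_reduced = x.astype('float32') @ A_T + b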
def preprocess_features(x, d=256):
    """Calculate PCA + whitening + L2 normalization for each vector.

    Args:
        x (ndarray): N x D, where N is the number of vectors, D the dimensionality
        d (int): number of output dimensions (how many principal components to use)
    Returns:
        transformed [N x d] matrix xt
    """
    n, orig_d = x.shape
    pcaw = faiss.PCAMatrix(d_in=orig_d, d_out=d, eigen_power=-0.5,
                           random_rotation=False)
    pcaw.train(x)
    assert pcaw.is_trained
    print('Performing PCA + whitening')
    x = pcaw.apply_py(x)
    print('x.shape after PCA + whitening:', x.shape)

    l2normalization = faiss.NormalizationTransform(d, 2.0)
    print('Performing L2 normalization')
    x = l2normalization.apply_py(x)
    return x
def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
    import faiss

    num_samples, num_dim = vecs.shape
    assert self.output_dim <= num_samples, \
        'training PCA requires at least %d points, but %d was given' % (
            self.output_dim, num_samples)
    assert self.output_dim < num_dim, \
        'PCA output dimension should be < data dimension, received (%d, %d)' % (
            self.output_dim, num_dim)

    pca = faiss.PCAMatrix(num_dim, self.output_dim)
    self.mean = np.mean(vecs, axis=0)  # 1 x 768
    pca.train(vecs)
    explained_variance_ratio = faiss.vector_to_array(
        pca.eigenvalues)[:self.output_dim]
    components = faiss.vector_to_array(pca.PCAMat).reshape(
        [-1, num_dim])[:self.output_dim]

    # permute eigenvectors according to variance
    opt_order = get_perm(explained_variance_ratio, self.num_locals)
    comp_tmp = np.reshape(components[opt_order], [self.output_dim, num_dim])

    self.pca_components = np.transpose(comp_tmp)  # 768 x 200
def main():
    # setting up the visible GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = "1"

    # evaluate on test datasets
    dataset = "pet"
    cfg = configdataset("pet_show_alldatabase", get_data_root())
    vectore_dir = 'best_model/se101_gem/model_epoch1/show_result'
    vecs = np.load(
        os.path.join(vectore_dir, "pet_show_alldatabase_vecs_ep1_resize.npy"))
    qvecs = np.load(
        os.path.join(vectore_dir, "pet_show_alldatabase_qvecs_ep1_resize.npy"))
    vecs = vecs.T
    qvecs = qvecs.T
    ori_dim = int(qvecs.shape[1])
    out_dim = 256

    # scores = np.dot(vecs, qvecs.T)
    # ranks = np.argsort(-scores, axis=0)
    # print(ranks.shape)
    # compute_map_and_print(dataset, ranks, cfg['gnd_id'])

    # PCA method for test
    mat = faiss.PCAMatrix(ori_dim, out_dim)
    print(ori_dim, vecs.shape)
    mat.train(np.ascontiguousarray(vecs))
    assert mat.is_trained
    qvecs_pca = mat.apply_py(np.ascontiguousarray(qvecs))
    vecs_pca = mat.apply_py(np.ascontiguousarray(vecs))
    print(qvecs_pca.shape)

    np.save(
        os.path.join(vectore_dir,
                     "pet_show_alldatabase_vecs_ep1_resize_pca.npy"),
        vecs_pca)
    np.save(
        os.path.join(vectore_dir,
                     "pet_show_alldatabase_qvecs_ep1_resize_pca.npy"),
        qvecs_pca)
def __init__(self, filename, name='RN', normalize=False, how_many=15000,
             st='test', indexes=None, distance='l2', preproc=None, **kwargs):
    super().__init__()
    self.rn_feats = self.load_features(filename, how_many, indexes)
    self.normalize = normalize
    self.st = st
    self.distance = distance
    self.indexes = indexes
    if normalize:
        self.rn_feats = utils.normalized(self.rn_feats, 1)
    self.name = name
    self.preproc = preproc
    # original feature dimensionality, needed by both preprocessing branches
    orig_dims = self.rn_feats.shape[1]

    if preproc == 'lsh':
        lsh_nbits = orig_dims // 2 if 'lsh_nbits' not in kwargs else kwargs['lsh_nbits']
        self.preproc_arg = lsh_nbits
        self.index = build_lsh(self.rn_feats, self.get_identifier(), lsh_nbits)

    if preproc == 'pca':
        pca_dims = orig_dims // 2 if 'pca_dims' not in kwargs else kwargs['pca_dims']
        self.preproc_arg = pca_dims
        mat = faiss.PCAMatrix(self.rn_feats.shape[1], pca_dims)
        mat.train(self.rn_feats)
        assert mat.is_trained
        self.rn_feats = mat.apply_py(self.rn_feats)
        assert self.rn_feats.shape[1] == pca_dims
        print('PCA from {} to {}'.format(orig_dims, pca_dims))
def train_preprocessor(self, preproc_str_local, xt_local):
    if not self.preproc_cachefile or not os.path.exists(self.preproc_cachefile):
        print("train preproc", preproc_str_local)
        d = xt_local.shape[1]
        t0 = time.time()
        if preproc_str_local.startswith('OPQ'):
            fi = preproc_str_local[3:].split('_')
            m = int(fi[0])
            dout = int(fi[1]) if len(fi) == 2 else d
            preproc = faiss.OPQMatrix(d, m, dout)
        elif preproc_str_local.startswith('PCAR'):
            dout = int(preproc_str_local[4:-1])
            preproc = faiss.PCAMatrix(d, dout, 0, True)
        else:
            assert False
        preproc.train(indexfunctions.sanitize(xt_local[:100000000]))
        print("preproc train done in %.3f s" % (time.time() - t0))
        faiss.write_VectorTransform(preproc, self.preproc_cachefile)
    else:
        print("load preproc ", self.preproc_cachefile)
        preproc = faiss.read_VectorTransform(self.preproc_cachefile)
    return preproc
def train_pca(hidden_states, target=100, max_train_sample_size=None):
    """Takes a 2d array of hidden states.

    Trains PCA to reduce the second dimension from its original size to target.
    If max_train_sample_size is provided, a random sample of that size from
    hidden_states will be used for training.

    Returns: pcamatrix and bias
    """
    d1 = hidden_states.shape[1]
    pca = faiss.PCAMatrix(d1, target)
    if max_train_sample_size and hidden_states.shape[0] > max_train_sample_size:
        rnd_indices = np.random.choice(len(hidden_states),
                                       size=max_train_sample_size)
        hidden_states_train = hidden_states[rnd_indices]
        pca.train(np.array(hidden_states_train))
    else:
        pca.train(np.array(hidden_states))
    bias = faiss.vector_to_array(pca.b)
    pcamatrix = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
    return (pcamatrix, bias)
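# Usage sketch (an assumption, not from the original source): reducing hidden
# states with the matrix and bias returned by train_pca. pcamatrix has shape
# (d_out, d_in) and faiss applies y = A x + b, so the reduction is a matmul
# with the transpose plus the bias. Assumes numpy and faiss are imported in
# the module that defines train_pca.
import numpy as np

hidden_states = np.random.random((1000, 768)).astype('float32')  # hypothetical input
pcamatrix, bias = train_pca(hidden_states, target=100)
reduced = hidden_states @ pcamatrix.T + bias
print(reduced.shape)  # (1000, 100)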
def index_patches(patches, pca_dims=64):
    # settings for faiss:
    num_lists, M, num_bits = 200, 16, 8

    # assertions:
    assert torch.is_tensor(patches) and patches.dim() == 2
    assert type(pca_dims) == int and pca_dims > 0
    if pca_dims > patches.size(1):
        print('WARNING: Input dimension < %d. Using fewer PCA dimensions.' % pca_dims)
        pca_dims = patches.size(1) - (patches.size(1) % M)

    # construct faiss index:
    quantizer = faiss.IndexFlatL2(pca_dims)
    assert pca_dims % M == 0
    sub_index = faiss.IndexIVFPQ(quantizer, pca_dims, num_lists, M, num_bits)
    pca_matrix = faiss.PCAMatrix(patches.size(1), pca_dims, 0, True)
    faiss_index = faiss.IndexPreTransform(pca_matrix, sub_index)

    # train faiss index:
    patches = patches.numpy()
    faiss_index.train(patches)
    faiss_index.add(patches)
    return faiss_index, sub_index
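# Usage sketch (assumed, not from the original source): querying the index
# built by index_patches. nprobe is set on the IVF sub-index that is returned
# alongside the wrapping IndexPreTransform. Sizes below are hypothetical.
import torch

patches = torch.rand(50000, 128)                   # hypothetical patch features
faiss_index, sub_index = index_patches(patches, pca_dims=64)
sub_index.nprobe = 16                              # visit 16 of the 200 inverted lists
queries = torch.rand(10, 128).numpy()
D, I = faiss_index.search(queries, 5)              # 5 nearest patches per query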
        yield res.get()
        res = res_next
    yield res.get()

fv3_dir = os.getenv('DDIR') + '/features/'

if 'train' in todo:
    f = h5py.File(fv3_dir + 'f100m/block0.hdf5', 'r')
    count = f['count'][0]
    labels = f['all_labels'][:count]
    features = f['all_feats'][:count]
    pca = faiss.PCAMatrix(2048, 256, 0, True)
    pca.train(features)
    faiss.write_VectorTransform(pca, fv3_dir + 'PCAR256.vt')

if 'apply' in todo:
    pca = faiss.read_VectorTransform(fv3_dir + 'PCAR256.vt')

    def load_block(i):
        f = h5py.File(fv3_dir + 'f100m/block%d.hdf5' % i, 'r')
        count = f['count'][0]
        # labels = f['all_labels'][:count]
        features = f['all_feats'][:count]
        return features

    # one read thread, one PCA computation thread, and main thread writes result.
def preprocess_features(npdata, n_components=16, method='PCA', n_jobs=1):
    """Preprocess an array of features.

    Args:
        npdata (np.array N * ndim): features to preprocess
        n_components (int): dim of output
        method (str): 'PCA', 'UMAP', 'TSNE' or 'AdaptiveTSNE'
        n_jobs (int): number of parallel jobs for the t-SNE methods
    Returns:
        np.array of dim N * n_components: data dimensionality-reduced,
        whitened (for PCA) and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    # Apply PCA-whitening with Faiss
    if method == 'PCA':
        mat = faiss.PCAMatrix(ndim, n_components, eigen_power=-0.5)
        mat.train(npdata)
        assert mat.is_trained
        npdata = mat.apply_py(npdata)

    # Apply UMAP for dimensionality reduction
    elif method == 'UMAP':
        fit = UMAP(n_components=n_components, metric='cosine')
        npdata = np.ascontiguousarray(fit.fit_transform(npdata))

    # Apply t-SNE for dimensionality reduction
    elif method == 'TSNE':
        if n_components > 3:
            X = sk_PCA().fit_transform(npdata)
            PCAinit = X[:, :n_components] / np.std(X[:, 0]) * 0.0001
            fit = TSNE(n_components=n_components, init=PCAinit, n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(npdata),
                                          dtype='float32')
        else:
            fit = sk_TSNE(n_components=n_components, metric='cosine',
                          n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(npdata))

    # Apply adaptive t-SNE for dimensionality reduction
    elif method == 'AdaptiveTSNE':
        pca = sk_PCA().fit(npdata)

        # Find the number of components that explain 95% of the variance
        i = 0
        s = 0
        for j in range(len(pca.explained_variance_ratio_)):
            s += pca.explained_variance_ratio_[j]
            if s > 0.95:
                i = j
                # Prevent fewer than 8 components
                if i < 8:
                    i = 8
                break

        # Fit and transform the data with the number of components that explain 95%
        pca95_well = sk_PCA(n_components=i).fit_transform(npdata)

        # Do a similarity measure with t-SNE on the PCA-reduced data
        if n_components > 3:
            PCAinit = pca95_well[:, :n_components] / np.std(pca95_well[:, 0]) * 0.0001
            fit = TSNE(n_components=n_components, init=PCAinit, n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(pca95_well))
        else:
            fit = sk_TSNE(n_components=n_components, metric='cosine',
                          n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(pca95_well))

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]

    return npdata
def test_mm(self):
    # trouble with MKL+fbmake that appears only at runtime. Check it here
    x = np.random.random(size=(100, 20)).astype('float32')
    mat = faiss.PCAMatrix(20, 10)
    mat.train(x)
    mat.apply_py(x)
def run_pca(x, output_dimensionality):
    mat = faiss.PCAMatrix(x.shape[1], output_dimensionality)
    mat.train(x)
    assert mat.is_trained
    return mat.apply_py(x)
if train or pca:
    train_subset = np.empty((n_train_subset, FEATURES_NUMBER), dtype=np.float32)
    print("Adding {} train features for training".format(n_train_subset))
    for label, features in label_features.items():
        n_features = max(1, features.shape[0] // 5)
        train_subset[subset_i:subset_i + n_features] = features[:n_features]
        # for n_feature in range(n_features):
        #     index_dict[subset_i + n_feature] = int(label)
        subset_i += n_features

if pca:
    if os.path.exists(INDEX_FILENAME_PCA):
        mat = faiss.read_VectorTransform(INDEX_FILENAME_PCA)
    else:
        mat = faiss.PCAMatrix(FEATURES_NUMBER, PCA_FEATURES)
        print("PCA training... started")
        mat.train(train_subset)
        print("PCA training... finished")
        faiss.write_VectorTransform(mat, INDEX_FILENAME_PCA)

if pca:
    print("PCA transformation... started")
    train_subset = mat.apply_py(train_subset)
    print("PCA transformation... finished")

cpu_index = faiss.IndexFlatL2(PCA_FEATURES if pca else FEATURES_NUMBER)
# cpu_index = faiss.index_factory(PCA_FEATURES if pca else FEATURES_NUMBER, "IVF4096,Flat")
index = faiss.index_cpu_to_gpu(res, 0, cpu_index, co) if gpu else cpu_index
def main():
    parser = get_parser()
    args = parser.parse_args()

    faiss_specs = parse_faiss_specs(args.faiss_specs)
    print("Faiss Specs:", faiss_specs)

    feat_path = osp.join(args.save_dir, "features")
    if osp.exists(feat_path + ".npy"):
        feats = np.load(feat_path + ".npy")
    else:
        generator, num = get_iterator(args)
        iterator = generator()

        feats = []
        for f in tqdm.tqdm(iterator, total=num):
            feats.append(f)

        del iterator
        del generator

        feats = np.concatenate(feats)
        print(feats.shape)

        os.makedirs(args.save_dir, exist_ok=True)
        # np.save(feat_path, feats)

        gc.collect()
        torch.cuda.empty_cache()

    reload = False
    for spec in faiss_specs:
        print("Processing spec", spec)

        if reload:
            print("Reloading...")
            del feats
            gc.collect()
            feats = np.load(feat_path + ".npy")

        save_path = osp.join(args.save_dir, spec.spec_str)
        os.makedirs(save_path, exist_ok=True)
        d = feats.shape[-1]
        x = feats

        if spec.pca > 0:
            print("Computing PCA")
            pca = faiss.PCAMatrix(d, spec.pca)
            pca.train(x)
            d = spec.pca
            b = faiss.vector_to_array(pca.b)
            A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
            np.save(osp.join(save_path, "pca_A"), A.T)
            np.save(osp.join(save_path, "pca_b"), b)
            print("Applying PCA")
            x = pca.apply_py(x)

        if spec.norm:
            reload = spec.pca <= 0
            print("Normalizing")
            faiss.normalize_L2(x)

        print("Computing kmeans")
        kmeans = faiss.Kmeans(
            d,
            spec.n_clus,
            niter=50,
            verbose=True,
            spherical=spec.sphere,
            max_points_per_centroid=feats.shape[0],
            gpu=True,
            nredo=3,
        )
        kmeans.train(x)
        np.save(osp.join(save_path, "centroids"), kmeans.centroids)

        del kmeans
        del x
        gc.collect()
def run_pca(x, output_dimensionality):
    x = c_f.to_numpy(x).astype(np.float32)
    mat = faiss.PCAMatrix(x.shape[1], output_dimensionality)
    mat.train(x)
    assert mat.is_trained
    return mat.apply_py(x)
author : h-j-13
time   : 2018-6-20
ref    : https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
"""

import numpy
import faiss

numpy.random.seed(13)

# Reduce 40-dimensional vectors to 10 dimensions with PCA
# ============= test data =============
mt = numpy.random.rand(1000, 40).astype('float32')
print(mt[0])

mat = faiss.PCAMatrix(40, 10)  # reduce from 40 dims to 10
mat.train(mt)                  # train the transform
assert mat.is_trained
tr = mat.apply_py(mt)

# print this to show that the magnitude of tr's columns is decreasing
print(tr[0])
# print((tr ** 2).sum(0))

# ============= Python console output =============
# [7.7770239e-01 2.3754121e-01 8.2427853e-01 9.6574920e-01 9.7260112e-01
#  4.5344925e-01 6.0904247e-01 7.7552652e-01 6.4161336e-01 7.2201824e-01
#  3.5036523e-02 2.9844946e-01 5.8512490e-02 8.5706097e-01 3.7285402e-01
#  6.7984796e-01 2.5627995e-01 3.4758121e-01 9.4127702e-03 3.5833380e-01
#  9.4909418e-01 2.1789901e-01 3.1939137e-01 9.1777241e-01 3.1903666e-02
#  6.5084539e-02 6.2982899e-01 8.7381345e-01 8.7157320e-03 7.4657726e-01
#  8.1284118e-01 7.5717449e-02 6.5645534e-01 5.0926220e-01 4.7988340e-01
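# Small follow-up check (not in the original tutorial): the energy per output
# column of tr should be non-increasing, because PCAMatrix orders components
# by decreasing eigenvalue.
col_energy = (tr ** 2).sum(0)
assert all(col_energy[i] >= col_energy[i + 1] for i in range(len(col_energy) - 1))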
if args.nt_sample == 0:
    xt_pca = xt[args.nt:args.nt + 10000]
    xt = xt[:args.nt]
else:
    xt_pca = xt[args.nt_sample:args.nt_sample + 10000]
    rs = np.random.RandomState(args.seed)
    idx = rs.choice(args.nt_sample, size=args.nt, replace=False)
    xt = xt[idx]

xb = xb[:args.nb]
d = xb.shape[1]

if args.pcadim != -1:
    print("training PCA: %d -> %d" % (d, args.pcadim))
    pca = faiss.PCAMatrix(d, args.pcadim)
    pca.train(sanitize(xt_pca))
    xt = pca.apply_py(sanitize(xt))
    xb = pca.apply_py(sanitize(xb))
    d = xb.shape[1]

######################################################
# Run clustering
######################################################

index = faiss.IndexFlatL2(d)

if ngpu > 0:
    print("moving index to GPU")