Example #1
def train_preprocessor():
    print "train preproc", preproc_str
    d = xt.shape[1]
    t0 = time.time()
    if preproc_str.startswith('OPQ'):
        fi = preproc_str[3:-1].split('_')
        m = int(fi[0])
        dout = int(fi[1]) if len(fi) == 2 else d
        preproc = faiss.OPQMatrix(d, m, dout)
    elif preproc_str.startswith('PCAR'):
        dout = int(preproc_str[4:-1])
        preproc = faiss.PCAMatrix(d, dout, 0, True)
    else:
        assert False
    preproc.train(sanitize(xt[:1000000]))
    print "preproc train done in %.3f s" % (time.time() - t0)
    return preproc
Example #2
    def __init__(self, d):
        d2 = 256
        nlist = 100  # number of coarse centroids
        m = 8  # number of sub-quantizers

        coarse_quantizer = faiss.IndexFlatL2(d2)
        sub_index = faiss.IndexIVFPQ(coarse_quantizer, d2, nlist, m, 8)
        pca_matrix = faiss.PCAMatrix(d, d2, 0, True)
        self.index2 = faiss.IndexPreTransform(pca_matrix, sub_index)

        sub_index.own_fields = True
        coarse_quantizer.this.disown()

        self.sub_index = sub_index
        self.pca_matrix = pca_matrix

        self.index2.nprobe = 10
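
A minimal end-to-end sketch of the same construction (my illustration, not from the original project; the input dimensionality and data are made up). One detail worth noting: nprobe is a property of the IVF sub-index, and setting it on the IndexPreTransform wrapper, as the snippet above does, may not be forwarded to the underlying C++ object.

import numpy as np
import faiss

d, d2, nlist, m = 512, 256, 100, 8
xb = np.random.rand(20000, d).astype('float32')
xq = np.random.rand(5, d).astype('float32')

coarse_quantizer = faiss.IndexFlatL2(d2)
sub_index = faiss.IndexIVFPQ(coarse_quantizer, d2, nlist, m, 8)
pca = faiss.PCAMatrix(d, d2, 0, True)
index = faiss.IndexPreTransform(pca, sub_index)

index.train(xb)          # trains the PCA matrix and the IVFPQ in one pass
index.add(xb)
sub_index.nprobe = 10    # set nprobe on the IVF sub-index directly
D, I = index.search(xq, 10)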
Example #3
    def test_pca(self):
        d = 64
        n = 1000
        np.random.seed(123)
        x = np.random.random(size=(n, d)).astype('float32')

        pca = faiss.PCAMatrix(d, 10)
        pca.train(x)
        y = pca.apply_py(x)

        # check that energy per component is decreasing
        column_norm2 = (y**2).sum(0)

        prev = 1e50
        for o in column_norm2:
            self.assertGreater(prev, o)
            prev = o
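
As a follow-up, the decreasing per-component energies checked above correspond to the eigenvalues stored on the trained transform. A small sketch reading them back (this assumes the eigenvalues field is sorted in decreasing order, consistent with the test above):

import numpy as np
import faiss

d, n = 64, 1000
np.random.seed(123)
x = np.random.random(size=(n, d)).astype('float32')

pca = faiss.PCAMatrix(d, 10)
pca.train(x)

# eigenvalues of the training covariance; the first d_out of them
# account for the variance retained by the projection
eigs = faiss.vector_to_array(pca.eigenvalues)
print("retained variance: %.1f%%" % (100 * eigs[:10].sum() / eigs.sum()))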
Example #4
    def test_progressive_dim(self):
        d = 32
        n = 10000
        k = 50
        xt, _, _ = get_dataset_2(d, n, 0, 0)

        # basic kmeans
        kmeans = faiss.Kmeans(d, k, gpu=True)
        kmeans.train(xt)

        pca = faiss.PCAMatrix(d, d)
        pca.train(xt)
        xt_pca = pca.apply(xt)

        # same test w/ Kmeans wrapper
        kmeans2 = faiss.Kmeans(d, k, progressive_dim_steps=5, gpu=True)
        kmeans2.train(xt_pca)
        self.assertLess(kmeans2.obj[-1], kmeans.obj[-1])
Example #5
def preprocess_features(npdata, pca):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    nan_location = np.isnan(npdata)
    inf_location = np.isinf(npdata)
    if nan_location.any() or inf_location.any():
        print('before_Astype_Feature NaN or Inf found. Nan count: ',
              np.sum(nan_location), ' Inf count: ', np.sum(inf_location))
        print('######################  break  ##################################')
        return npdata

    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    nan_location = np.isnan(npdata)
    inf_location = np.isinf(npdata)
    if nan_location.any() or inf_location.any():
        print('after_Astype_Feature NaN or Inf found. Nan count: ',
              np.sum(nan_location), ' Inf count: ', np.sum(inf_location))
        print('######################  break  ##################################')
        return npdata

    # Apply PCA-whitening with Faiss
    mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained
    npdata = mat.apply_py(npdata)

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]

    return npdata
Example #6
def kmeans_cluster(x, n_clusters, verbose=False, pcaDim=-1):
    """
    K-means clustering using faiss library
    """
    assert x.dim() == 2
    n, d = x.size()
    X = x.numpy()
    if pcaDim > 0:
        if verbose: print(f'Applying PCA from {d} dimensions to {pcaDim}')
        pca = faiss.PCAMatrix(d, pcaDim)
        pca.train(X)
        assert pca.is_trained
        Xpca = pca.apply_py(X)
    else:
        Xpca = X
    if verbose: print('Clustering 2-dim tensor of size {}'.format(X.shape))

    kmeans = faiss.Kmeans(Xpca.shape[1], n_clusters, niter=20, verbose=verbose)
    kmeans.train(Xpca)
    D, I = kmeans.index.search(Xpca, 1)
    return torch.LongTensor(I).squeeze()
Example #7
def preprocess_features(npdata, pca=64):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    # Apply PCA-whitening with Faiss
    mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained
    npdata = mat.apply_py(npdata)

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]
    return npdata
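
One caveat with the final step in both preprocess_features variants above: np.linalg.norm is zero for an all-zero row, so the division produces NaNs. A guarded alternative for that last step (a sketch; the epsilon value is my choice, not from the original code):

import numpy as np

def l2_normalize_rows(x, eps=1e-12):
    # clamp zero norms so all-zero rows stay zero instead of becoming NaN
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / np.maximum(norms, eps)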
Example #8
File: pca.py  Project: sdadas/fairseq
def main():
    parser = get_parser()
    args = parser.parse_args()

    print("Reading features")
    x = np.load(args.data, mmap_mode="r")

    print("Computing PCA")
    pca = faiss.PCAMatrix(x.shape[-1], args.dim, args.eigen_power)
    pca.train(x)
    b = faiss.vector_to_array(pca.b)
    A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)

    os.makedirs(args.output, exist_ok=True)

    prefix = str(args.dim)
    if args.eigen_power != 0:
        prefix += f"_{args.eigen_power}"

    np.save(osp.join(args.output, f"{prefix}_pca_A"), A.T)
    np.save(osp.join(args.output, f"{prefix}_pca_b"), b)
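
faiss applies a PCAMatrix as y = A x + b with A of shape (d_out, d_in), so the arrays saved above (A.T and b) are enough to reproduce the transform in plain NumPy later. A sketch, assuming args.dim was 256 and args.eigen_power was 0, so the prefix is just "256":

import numpy as np

A_T = np.load("256_pca_A.npy")    # (d_in, d_out)
b = np.load("256_pca_b.npy")      # (d_out,)

x = np.random.rand(8, A_T.shape[0]).astype('float32')
y = x @ A_T + b                   # matches pca.apply_py(x)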
Example #9
def preprocess_features(x, d=256):
    """
    Calculate PCA + Whitening + L2 normalization for each vector

    Args:
        x (ndarray): N x D, where N is number of vectors, D - dimensionality
        d (int): number of output dimensions (how many principal components to use).
    Returns:
        transformed [N x d] matrix xt .
    """
    n, orig_d = x.shape
    pcaw = faiss.PCAMatrix(d_in=orig_d, d_out=d, eigen_power=-0.5, random_rotation=False)
    pcaw.train(x)
    assert pcaw.is_trained
    print('Performing PCA + whitening')
    x = pcaw.apply_py(x)
    print('x.shape after PCA + whitening:', x.shape)
    l2normalization = faiss.NormalizationTransform(d, 2.0)
    print('Performing L2 normalization')
    x = l2normalization.apply_py(x)
    return x
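
The NormalizationTransform(d, 2.0) step is plain row-wise L2 normalization; faiss.normalize_L2 does the same thing in place and avoids allocating a second array (my note, not from the original snippet):

import numpy as np
import faiss

x = np.random.rand(100, 256).astype('float32')
faiss.normalize_L2(x)   # in-place row-wise L2 normalization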
Example #10
File: pca.py  Project: leolorenzoluis/gnes
    def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
        import faiss
        num_samples, num_dim = vecs.shape
        assert self.output_dim <= num_samples, 'training PCA requires at least %d points, but %d was given' % (
            self.output_dim, num_samples)
        assert self.output_dim < num_dim, 'PCA output dimension should < data dimension, received (%d, %d)' % (
            self.output_dim, num_dim)

        pca = faiss.PCAMatrix(num_dim, self.output_dim)
        self.mean = np.mean(vecs, axis=0)  # 1 x 768
        pca.train(vecs)
        explained_variance_ratio = faiss.vector_to_array(
            pca.eigenvalues)[:self.output_dim]
        components = faiss.vector_to_array(pca.PCAMat).reshape(
            [-1, num_dim])[:self.output_dim]

        # permute eigenvectors according to explained variance
        opt_order = get_perm(explained_variance_ratio, self.num_locals)
        comp_tmp = np.reshape(components[opt_order],
                              [self.output_dim, num_dim])

        self.pca_components = np.transpose(comp_tmp)  # 768 x 200
Example #11
def main():
    # setting up the visible GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = "1"
    # evaluate on test datasets
    dataset = "pet"
    cfg = configdataset("pet_show_alldatabase", get_data_root())
    vectore_dir = 'best_model/se101_gem/model_epoch1/show_result'
    vecs = np.load(
        os.path.join(vectore_dir, "pet_show_alldatabase_vecs_ep1_resize.npy"))
    qvecs = np.load(
        os.path.join(vectore_dir, "pet_show_alldatabase_qvecs_ep1_resize.npy"))
    vecs = vecs.T
    qvecs = qvecs.T
    ori_dim = int(qvecs.shape[1])
    out_dim = 256

    # scores = np.dot(vecs, qvecs.T)
    # ranks = np.argsort(-scores, axis=0)
    # print(ranks.shape)
    # compute_map_and_print(dataset, ranks, cfg['gnd_id'])

    # PCA method for test

    mat = faiss.PCAMatrix(ori_dim, out_dim)
    print(ori_dim, vecs.shape)
    mat.train(np.ascontiguousarray(vecs))
    assert mat.is_trained
    qvecs_pca = mat.apply_py(np.ascontiguousarray(qvecs))
    vecs_pca = mat.apply_py(np.ascontiguousarray(vecs))
    print(qvecs_pca.shape)

    np.save(
        os.path.join(vectore_dir,
                     "pet_show_alldatabase_vecs_ep1_resize_pca.npy"), vecs_pca)
    np.save(
        os.path.join(vectore_dir,
                     "pet_show_alldatabase_qvecs_ep1_resize_pca.npy"),
        qvecs_pca)
Example #12
    def __init__(self,
                 filename,
                 name='RN',
                 normalize=False,
                 how_many=15000,
                 st='test',
                 indexes=None,
                 distance='l2',
                 preproc=None,
                 **kwargs):
        super().__init__()

        self.rn_feats = self.load_features(filename, how_many, indexes)
        self.normalize = normalize
        self.st = st
        self.distance = distance
        self.indexes = indexes
        if normalize:
            self.rn_feats = utils.normalized(self.rn_feats, 1)
        self.name = name
        self.preproc = preproc
        orig_dims = self.rn_feats.shape[1]
        if preproc == 'lsh':
            lsh_nbits = kwargs.get('lsh_nbits', orig_dims // 2)
            self.preproc_arg = lsh_nbits
            self.index = build_lsh(self.rn_feats, self.get_identifier(),
                                   lsh_nbits)
        if preproc == 'pca':
            pca_dims = kwargs.get('pca_dims', orig_dims // 2)
            self.preproc_arg = pca_dims
            mat = faiss.PCAMatrix(orig_dims, pca_dims)
            mat.train(self.rn_feats)
            assert mat.is_trained
            self.rn_feats = mat.apply_py(self.rn_feats)
            assert self.rn_feats.shape[1] == pca_dims
            print('PCA from {} to {}'.format(orig_dims, pca_dims))
Example #13
    def train_preprocessor(self, preproc_str_local, xt_local):
        if not self.preproc_cachefile or not os.path.exists(
                self.preproc_cachefile):
            print("train preproc", preproc_str_local)
            d = xt_local.shape[1]
            t0 = time.time()
            if preproc_str_local.startswith('OPQ'):
                fi = preproc_str_local[3:].split('_')
                m = int(fi[0])
                dout = int(fi[1]) if len(fi) == 2 else d
                preproc = faiss.OPQMatrix(d, m, dout)
            elif preproc_str_local.startswith('PCAR'):
                dout = int(preproc_str_local[4:-1])
                preproc = faiss.PCAMatrix(d, dout, 0, True)
            else:
                assert False
            preproc.train(indexfunctions.sanitize(xt_local[:100000000]))
            print("preproc train done in %.3f s" % (time.time() - t0))
            faiss.write_VectorTransform(preproc, self.preproc_cachefile)
        else:
            print("load preproc ", self.preproc_cachefile)
            preproc = faiss.read_VectorTransform(self.preproc_cachefile)
        return preproc
Example #14
def train_pca(hidden_states, target=100, max_train_sample_size=None):
    '''
    Takes a 2-D array of hidden states and trains PCA to reduce the
    second dimension to `target`.

    If max_train_sample_size is provided, a random sample of that many rows
    from hidden_states is used for training.

    Returns: (pcamatrix, bias)
    '''
    d1 = hidden_states.shape[1]

    pca = faiss.PCAMatrix(d1, target)
    if max_train_sample_size and hidden_states.shape[0] > max_train_sample_size:
        rnd_indices = np.random.choice(len(hidden_states),
                                       size=max_train_sample_size)
        hidden_states_train = hidden_states[rnd_indices]
        pca.train(np.array(hidden_states_train))

    else:
        pca.train(np.array(hidden_states))

    bias = faiss.vector_to_array(pca.b)
    pcamatrix = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
    return (pcamatrix, bias)
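
A usage sketch for the returned pair, assuming the faiss convention y = A x + b with A of shape (d_out, d_in); the data here is synthetic:

import numpy as np

hidden_states = np.random.rand(5000, 768).astype('float32')
pcamatrix, bias = train_pca(hidden_states, target=100)

# apply the learned projection the way faiss would
reduced = hidden_states @ pcamatrix.T + bias   # shape (5000, 100)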
Example #15
def index_patches(patches, pca_dims=64):

    # settings for faiss:
    num_lists, M, num_bits = 200, 16, 8

    # assertions:
    assert torch.is_tensor(patches) and patches.dim() == 2
    assert type(pca_dims) == int and pca_dims > 0
    if pca_dims > patches.size(1):
        print('WARNING: Input dimension < %d. Using fewer PCA dimensions.' % pca_dims)
        pca_dims = patches.size(1) - (patches.size(1) % M)

    # construct faiss index:
    quantizer = faiss.IndexFlatL2(pca_dims)
    assert pca_dims % M == 0
    sub_index = faiss.IndexIVFPQ(quantizer, pca_dims, num_lists, M, num_bits)
    pca_matrix = faiss.PCAMatrix(patches.size(1), pca_dims, 0, True)
    faiss_index = faiss.IndexPreTransform(pca_matrix, sub_index)

    # train faiss index:
    patches = patches.numpy()
    faiss_index.train(patches)
    faiss_index.add(patches)
    return faiss_index, sub_index
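
A short usage sketch for the pair returned above (my illustration, not part of the original code). Searches go through faiss_index, which applies the PCA internally, while nprobe is set on sub_index:

import torch

patches = torch.rand(20000, 256)
faiss_index, sub_index = index_patches(patches, pca_dims=64)

sub_index.nprobe = 16                   # number of IVF lists to visit
queries = torch.rand(5, 256).numpy()    # float32, same dim as patches
D, I = faiss_index.search(queries, 10)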
Example #16
            yield res.get()
        res = res_next
    yield res.get()


fv3_dir = os.getenv('DDIR') + '/features/'

if 'train' in todo:

    f = h5py.File(fv3_dir + 'f100m/block0.hdf5', 'r')

    count = f['count'][0]
    labels = f['all_labels'][:count]
    features = f['all_feats'][:count]

    pca = faiss.PCAMatrix(2048, 256, 0, True)

    pca.train(features)
    faiss.write_VectorTransform(pca, fv3_dir + 'PCAR256.vt')

if 'apply' in todo:
    pca = faiss.read_VectorTransform(fv3_dir + 'PCAR256.vt')

    def load_block(i):
        f = h5py.File(fv3_dir + 'f100m/block%d.hdf5' % i, 'r')
        count = f['count'][0]
        # labels = f['all_labels'][:count]
        features = f['all_feats'][:count]
        return features

    # one read thread, one PCA computation thread, and main thread writes result.
Example #17
def preprocess_features(npdata, n_components=16, method='PCA', n_jobs=1):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    # Apply PCA-whitening with Faiss
    if method == 'PCA':
        mat = faiss.PCAMatrix(ndim, n_components, eigen_power=-0.5)
        mat.train(npdata)
        assert mat.is_trained
        npdata = mat.apply_py(npdata)
    # Apply UMAP for dimensionality reduction
    elif method == 'UMAP':
        fit = UMAP(n_components=n_components, metric='cosine')
        npdata = np.ascontiguousarray(fit.fit_transform(npdata))
    # Apply T-SNE for dimensionality reduction
    elif method == 'TSNE':
        if n_components > 3:
            X = sk_PCA().fit_transform(npdata)
            PCAinit = X[:, :n_components] / np.std(X[:, 0]) * 0.0001
            fit = TSNE(n_components=n_components, init=PCAinit, n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(npdata),
                                          dtype='float32')
        else:
            fit = sk_TSNE(n_components=n_components,
                          metric='cosine',
                          n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(npdata))
    # Apply adaptive T-SNE for dimensionality reduction
    elif method == 'AdaptiveTSNE':
        pca = sk_PCA().fit(npdata)

        # Find all the eigenvectors that explain 95% of the variance
        i = 0
        s = 0
        for j in range(len(pca.explained_variance_ratio_)):
            s += pca.explained_variance_ratio_[j]
            if s > 0.95:
                i = j
                # do not go below 8 components
                if i < 8:
                    i = 8
                break

        # Fit and transform the data with the number of components that explain 95%
        pca95_well = sk_PCA(n_components=i).fit_transform(npdata)

        # Do a similarity measure with TSNE on the pca data
        if n_components > 3:
            PCAinit = pca95_well[:, :n_components] / np.std(
                pca95_well[:, 0]) * 0.0001
            fit = TSNE(n_components=n_components, init=PCAinit, n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(pca95_well))
        else:
            fit = sk_TSNE(n_components=n_components,
                          metric='cosine',
                          n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(pca95_well))

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]
    return npdata
Example #18
    def test_mm(self):
        # trouble with MKL+fbmake that appears only at runtime. Check it here
        x = np.random.random(size=(100, 20)).astype('float32')
        mat = faiss.PCAMatrix(20, 10)
        mat.train(x)
        mat.apply_py(x)
Example #19
def run_pca(x, output_dimensionality):
    mat = faiss.PCAMatrix(x.shape[1], output_dimensionality)
    mat.train(x)
    assert mat.is_trained
    return mat.apply_py(x)
Example #20
    if train or pca:
        train_subset = np.empty((n_train_subset, FEATURES_NUMBER), dtype=np.float32)
        
        print("Adding {} train features for training".format(n_train_subset))
        for label, features in label_features.items():
            n_features = max(1, features.shape[0] // 5)
            train_subset[subset_i:subset_i+n_features] = features[:n_features]
            #for n_feature in range(n_features):
            #    index_dict[subset_i+n_feature] = int(label)
            subset_i += n_features

    if pca:
        if os.path.exists(INDEX_FILENAME_PCA):
            mat = faiss.read_VectorTransform(INDEX_FILENAME_PCA)
        else:
            mat = faiss.PCAMatrix(FEATURES_NUMBER, PCA_FEATURES)

            print("PCA training... started")
            mat.train(train_subset)
            print("PCA training... finished")
            
            faiss.write_VectorTransform(mat, INDEX_FILENAME_PCA)

    if pca:
        print("PCA transformation... started")
        train_subset = mat.apply_py(train_subset)
        print("PCA transformation... finished")

    cpu_index = faiss.IndexFlatL2(PCA_FEATURES if pca else FEATURES_NUMBER) 
    #cpu_index =  faiss.index_factory(PCA_FEATURES if pca else FEATURES_NUMBER, "IVF4096,Flat")
    index = faiss.index_cpu_to_gpu(res, 0, cpu_index, co) if gpu else cpu_index
Example #21
def main():
    parser = get_parser()
    args = parser.parse_args()

    faiss_specs = parse_faiss_specs(args.faiss_specs)
    print("Faiss Specs:", faiss_specs)

    feat_path = osp.join(args.save_dir, "features")
    if osp.exists(feat_path + ".npy"):
        feats = np.load(feat_path + ".npy")
    else:
        generator, num = get_iterator(args)
        iterator = generator()

        feats = []
        for f in tqdm.tqdm(iterator, total=num):
            feats.append(f)

        del iterator
        del generator

        feats = np.concatenate(feats)

        print(feats.shape)

        os.makedirs(args.save_dir, exist_ok=True)
        # np.save(feat_path, feats)

        gc.collect()
        torch.cuda.empty_cache()

    reload = False
    for spec in faiss_specs:
        print("Processing spec", spec)

        if reload:
            print("Reloading...")
            del feats
            gc.collect()
            feats = np.load(feat_path + ".npy")

        save_path = osp.join(args.save_dir, spec.spec_str)
        os.makedirs(save_path, exist_ok=True)
        d = feats.shape[-1]
        x = feats
        if spec.pca > 0:
            print("Computing PCA")
            pca = faiss.PCAMatrix(d, spec.pca)
            pca.train(x)
            d = spec.pca
            b = faiss.vector_to_array(pca.b)
            A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
            np.save(osp.join(save_path, "pca_A"), A.T)
            np.save(osp.join(save_path, "pca_b"), b)
            print("Applying PCA")
            x = pca.apply_py(x)

        if spec.norm:
            reload = spec.pca <= 0
            print("Normalizing")
            faiss.normalize_L2(x)

        print("Computing kmeans")
        kmeans = faiss.Kmeans(
            d,
            spec.n_clus,
            niter=50,
            verbose=True,
            spherical=spec.sphere,
            max_points_per_centroid=feats.shape[0],
            gpu=True,
            nredo=3,
        )
        kmeans.train(x)
        np.save(osp.join(save_path, "centroids"), kmeans.centroids)
        del kmeans
        del x
        gc.collect()
Example #22
def run_pca(x, output_dimensionality):
    x = c_f.to_numpy(x).astype(np.float32)
    mat = faiss.PCAMatrix(x.shape[1], output_dimensionality)
    mat.train(x)
    assert mat.is_trained
    return mat.apply_py(x)
Example #23
author  :   h-j-13
time    :   2018-6-20
ref     :   https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
"""

import numpy
import faiss

numpy.random.seed(13)


# Reduce 40-dimensional vectors to 10 dimensions with PCA
# ============= test data =============
mt = numpy.random.rand(1000, 40).astype('float32')
print(mt[0])
mat = faiss.PCAMatrix(40, 10)                       # reduce 40 dims to 10
mat.train(mt)                                       # train the transform
assert mat.is_trained
tr = mat.apply_py(mt)
# print this to show that the magnitude of tr's columns is decreasing
print(tr[0])
# print (tr ** 2).sum(0)

# =============Python Console=============
# [7.7770239e-01 2.3754121e-01 8.2427853e-01 9.6574920e-01 9.7260112e-01
#  4.5344925e-01 6.0904247e-01 7.7552652e-01 6.4161336e-01 7.2201824e-01
#  3.5036523e-02 2.9844946e-01 5.8512490e-02 8.5706097e-01 3.7285402e-01
#  6.7984796e-01 2.5627995e-01 3.4758121e-01 9.4127702e-03 3.5833380e-01
#  9.4909418e-01 2.1789901e-01 3.1939137e-01 9.1777241e-01 3.1903666e-02
#  6.5084539e-02 6.2982899e-01 8.7381345e-01 8.7157320e-03 7.4657726e-01
#  8.1284118e-01 7.5717449e-02 6.5645534e-01 5.0926220e-01 4.7988340e-01
Example #24
if args.nt_sample == 0:
    xt_pca = xt[args.nt:args.nt + 10000]
    xt = xt[:args.nt]
else:
    xt_pca = xt[args.nt_sample:args.nt_sample + 10000]
    rs = np.random.RandomState(args.seed)
    idx = rs.choice(args.nt_sample, size=args.nt, replace=False)
    xt = xt[idx]

xb = xb[:args.nb]

d = xb.shape[1]

if args.pcadim != -1:
    print "training PCA: %d -> %d" % (d, args.pcadim)
    pca = faiss.PCAMatrix(d, args.pcadim)
    pca.train(sanitize(xt_pca))
    xt = pca.apply_py(sanitize(xt))
    xb = pca.apply_py(sanitize(xb))
    d = xb.shape[1]


######################################################
# Run clustering
######################################################


index = faiss.IndexFlatL2(d)

if ngpu > 0:
    print "moving index to GPU"