示例#1
0
    def test_ivf_nprobe(self):
        """Test in case of nprobe > nlist.

        A "BIVF10" index is trained and filled from ``self.xt`` / ``self.xb``
        and searched with ``nprobe`` set far above its 10 lists.  Its k-NN and
        range-search distances are compared against a "BFlat" index holding
        the same database vectors.  Only distances (and range-search limits)
        are compared; ids are not, because they may differ between the two
        indexes.
        """
        xt, xb, xq = self.xt, self.xb, self.xq
        # Dimension in bits: 8 per column of the query matrix.
        d = xq.shape[1] * 8

        # nlist = 10
        index = faiss.index_binary_factory(d, "BIVF10")
        index.train(xt)
        index.add(xb)
        # When nprobe >= nlist, it is equivalent to an IndexFlat.
        index.nprobe = 2048

        ref_index = faiss.index_binary_factory(d, "BFlat")
        ref_index.add(xb)

        # test kNN search
        k = 5
        D, _ = index.search(xq, k)
        ref_D, _ = ref_index.search(xq, k)
        # assert_array_equal also fails cleanly on a shape mismatch and
        # reports which entries differ.
        np.testing.assert_array_equal(D, ref_D)

        # test range search
        # Radius for the binary indexes (Hamming distance, not squared L2).
        thresh = 5
        lims, D, _ = index.range_search(xq, thresh)
        ref_lims, ref_D, _ = ref_index.range_search(xq, thresh)
        np.testing.assert_array_equal(lims, ref_lims)
        np.testing.assert_array_equal(D, ref_D)
    def create(hashes: t.Iterable[PDQ_HASH_TYPE],
               custom_ids: t.Optional[t.Iterable[int]] = None) -> "PDQFlatHashIndex":
        """
        Creates a PDQFlatHashIndex for use searching against the provided hashes.

        Parameters
        ----------
        hashes: sequence of PDQ Hashes
            The PDQ hashes to create the index with
        custom_ids: sequence of custom ids for the PDQ Hashes (optional)
            Optional sequence of custom id values to use for the PDQ hashes for any
            method relating to indexes (e.g., hash_at). If provided, the nth item in
            custom_ids will be used as the id for the nth hash in hashes. If not provided
            then the ids for the hashes will be assumed to be their respective index
            in hashes (i.e., the nth hash would have id n, starting from 0).

        Returns
        -------
        a PDQFlatHashIndex of these hashes
        """
        # NOTE(review): takes no self/cls, so this is presumably wrapped by a
        # @staticmethod decorator outside this snippet -- confirm.

        # Each hex-encoded hash becomes one row of uint8 bytes.
        vectors = numpy.array([
            numpy.frombuffer(binascii.unhexlify(pdq_hash), dtype=numpy.uint8)
            for pdq_hash in hashes
        ])
        index = faiss.index_binary_factory(BITS_IN_PDQ, "BFlat")
        # Identity test: `!= None` is evaluated elementwise when custom_ids is
        # a numpy array and then fails when used as a condition.
        if custom_ids is not None:
            # Wrap the flat index so caller-supplied ids can be attached.
            index = faiss.IndexBinaryIDMap2(index)
            i64_ids = numpy.array(
                [uint64_to_int64(custom_id) for custom_id in custom_ids])
            index.add_with_ids(vectors, i64_ids)
        else:
            index.add(vectors)
        return PDQFlatHashIndex(index)
示例#3
0
    def test_binary(self):
        """Exercise pre-assigned add / search / range search on a binary IVF.

        Vectors of a synthetic dataset are binarized with an "ITQ,LSHt" index
        and stored in a "BIVF200" index whose own quantizer only holds
        all-zero placeholder centroids.  List assignment is instead computed
        by a k-means quantizer trained on the first 20 float dimensions and
        passed in through ``ivf_tools``.

        Checks:
          * the fraction of top-4 results matching the ground truth does not
            decrease as nprobe grows over 1, 10, 20;
          * with a radius just above the largest k-NN distance, every k-NN
            result of a query is contained in its range-search results.
        """
        ds = datasets.SyntheticDataset(128, 2000, 2000, 200)

        d = ds.d
        xt = ds.get_train()
        xq = ds.get_queries()
        xb = ds.get_database()

        # define alternative quantizer on the 20 first dims of vectors (will be in float)
        km = faiss.Kmeans(20, 50)
        km.train(xt[:, :20].copy())
        alt_quantizer = km.index

        binarizer = faiss.index_factory(d, "ITQ,LSHt")
        binarizer.train(xt)

        xb_bin = binarizer.sa_encode(xb)
        xq_bin = binarizer.sa_encode(xq)

        index = faiss.index_binary_factory(d, "BIVF200")

        # Placeholder centroids: the index quantizer is never used for
        # assignment here, it only has to hold nlist entries.
        fake_centroids = np.zeros((index.nlist, index.d // 8), dtype="uint8")
        index.quantizer.add(fake_centroids)
        index.is_trained = True

        # add elements xb
        a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
        ivf_tools.add_preassigned(index, xb_bin, a)

        # Loop-invariant inputs, computed once instead of per nprobe value.
        xq_head = xq[:, :20].copy()
        gt_top4 = ds.get_groundtruth()[:, :4]

        # search elements xq, increase nprobe, check 4 first results w/ groundtruth
        prev_inter_perf = 0
        for nprobe in 1, 10, 20:
            index.nprobe = nprobe
            a = alt_quantizer.search(xq_head, index.nprobe)[1]
            _, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
            inter_perf = (I == gt_top4).sum() / I.size
            self.assertGreaterEqual(inter_perf, prev_inter_perf)
            prev_inter_perf = inter_perf

        # test range search

        index.nprobe = 20

        a = alt_quantizer.search(xq_head, index.nprobe)[1]

        # just to find a reasonable radius
        D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
        radius = int(D.max() + 1)

        lims, _, IR = ivf_tools.range_search_preassigned(
            index, xq_bin, radius, a)

        # with that radius the k-NN results are a subset of the range search results
        for q, knn_ids in enumerate(I):
            l0, l1 = lims[q], lims[q + 1]
            # For sets, <= is the subset test.
            self.assertLessEqual(set(knn_ids), set(IR[l0:l1]))
        def train_and_get_centroids(override_kmeans_index):
            """Train a fresh "BIVF10" index on ``xt`` and return its centroids.

            ``d`` and ``xt`` are taken from the enclosing scope.  When
            ``override_kmeans_index`` is not None it is installed as the
            index's ``clustering_index`` before training.  The quantizer's
            stored codes are returned as an array with ``d // 8`` columns.
            """
            ivf = faiss.index_binary_factory(d, b"BIVF10")
            ivf.verbose = True
            if override_kmeans_index is not None:
                ivf.clustering_index = override_kmeans_index
            ivf.train(xt)

            # Downcast so the quantizer's stored codes (.xb) are accessible.
            quantizer = faiss.downcast_IndexBinary(ivf.quantizer)
            packed_codes = faiss.vector_to_array(quantizer.xb)
            return packed_codes.reshape(-1, d // 8)
示例#5
0
    def test_factory_IVF_HNSW(self):
        """The "BIVF1024_BHNSW32" spec at d=256 gives code_size 32, nlist 1024."""
        ivf_hnsw = faiss.index_binary_factory(256, "BIVF1024_BHNSW32")
        assert ivf_hnsw.code_size == 32
        assert ivf_hnsw.nlist == 1024
示例#6
0
    def test_factory_HNSW(self):
        """The "BHNSW32" spec at d=256 gives an index with code_size 32."""
        hnsw = faiss.index_binary_factory(256, "BHNSW32")
        assert hnsw.code_size == 32
示例#7
0
    def test_factory_Flat(self):
        """The "BFlat" spec at d=16 gives an index with code_size 2."""
        flat = faiss.index_binary_factory(16, "BFlat")
        assert flat.code_size == 2
示例#8
0
    def test_factory_IVF(self):
        """The "BIVF10" spec at d=16 gives inverted lists, nlist 10, code_size 2."""
        ivf = faiss.index_binary_factory(16, "BIVF10")
        assert ivf.invlists is not None
        assert ivf.nlist == 10
        assert ivf.code_size == 2
 def __init__(self):
     """Initialise the parent with an empty id-mapped "BFlat" binary index."""
     flat_index = faiss.index_binary_factory(BITS_IN_PDQ, "BFlat")
     # IndexBinaryIDMap2 wraps the flat index before it is handed to the parent.
     super().__init__(faiss.IndexBinaryIDMap2(flat_index))
示例#10
0
    def test_factory_1(self):
        """The "BIVF10" spec at d=16 gives an index that exposes inverted lists."""
        ivf = faiss.index_binary_factory(16, "BIVF10")
        assert ivf.invlists is not None
示例#11
0
    def build(self, config):
        '''
            Build a new gallery index, or append to / remove from a stored one.

            Keys read from ``config``:
                index_operation: "new" (default), "append" or "remove"
                    (case-insensitive).
                data_file, image_root, delimiter: passed to split_datafile
                    to obtain the gallery images and their docs.
                index_dir: directory holding "vector.index" (the faiss index)
                    and "id_map.pkl" (maps id -> image doc).
                index_method: faiss factory string, default "HNSW32"; the
                    bare value "IVF" gets a list count derived from the
                    number of gallery images.
                dist_type: "hamming" selects a binary index, "IP" selects
                    inner product, anything else L2.
                embedding_size: dimension handed to the faiss factory.

            Raises:
                AssertionError: unsupported operation, missing index files
                    for append/remove, or index / id_map size mismatch.
                RuntimeError: "remove" requested with index_method HNSW32.
        '''
        operation_method = config.get("index_operation", "new").lower()
        # Validate before any expensive work (feature extraction) is done.
        assert operation_method in [
            "new", "remove", "append"
        ], "Only append, remove and new operation are supported"

        gallery_images, gallery_docs = split_datafile(
            config['data_file'], config['image_root'], config['delimiter'])

        is_binary = config["dist_type"] == "hamming"
        # Same default everywhere, so a missing key cannot raise KeyError in
        # one place while being defaulted in another.
        configured_method = config.get("index_method", "HNSW32")

        # vector.index: faiss index file
        # id_map.pkl: use this file to map id to image_doc
        index_path = os.path.join(config["index_dir"], "vector.index")
        id_map_path = os.path.join(config["index_dir"], "id_map.pkl")

        if operation_method in ["remove", "append"]:
            # if remove or append, vector.index and id_map.pkl must exist
            # (os.path.join alone is always truthy, so test for existence)
            assert os.path.exists(
                index_path
            ), "The vector.index dose not exist in {} when 'index_operation' is not None".format(
                config["index_dir"])
            assert os.path.exists(
                id_map_path
            ), "The id_map.pkl dose not exist in {} when 'index_operation' is not None".format(
                config["index_dir"])
            # Binary indexes are stored with write_index_binary below, so they
            # must be loaded with the matching reader.
            if is_binary:
                index = faiss.read_index_binary(index_path)
            else:
                index = faiss.read_index(index_path)
            # NOTE: pickle.load can execute arbitrary code; index_dir must
            # only ever point at trusted files.
            with open(id_map_path, 'rb') as fd:
                ids = pickle.load(fd)
            assert index.ntotal == len(
                ids), "data number in index is not equal in in id_map"
        else:
            os.makedirs(config["index_dir"], exist_ok=True)
            index_method = configured_method

            # if IVF method, compute the number of lists automatically;
            # keep it >= 1 so tiny galleries do not yield "IVF0"
            if index_method == "IVF":
                nlist = max(1, min(len(gallery_images) // 8, 65536))
                index_method = index_method + str(nlist) + ",Flat"

            # build index
            if is_binary:
                # for binary index, add B at head of index_method
                index_method = "B" + index_method
                index = faiss.index_binary_factory(config["embedding_size"],
                                                   index_method)
            else:
                dist_type = faiss.METRIC_INNER_PRODUCT if config[
                    "dist_type"] == "IP" else faiss.METRIC_L2
                index = faiss.index_factory(config["embedding_size"],
                                            index_method, dist_type)
                index = faiss.IndexIDMap2(index)
            ids = {}

        if configured_method == "HNSW32":
            logger.warning(
                "The HNSW32 method dose not support 'remove' operation")

        if operation_method != "remove":
            # when removing data from the index, features are not needed, so
            # they are only extracted on this path (and only after the index
            # has been loaded or created successfully)
            gallery_features = self._extract_features(gallery_images, config)

            # calculate id for new data
            start_id = max(ids.keys()) + 1 if ids else 0
            ids_now = (
                np.arange(0, len(gallery_images)) + start_id).astype(np.int64)

            if is_binary:
                # NOTE(review): the binary path never calls train() and adds
                # vectors only for "new"; an "append" updates id_map without
                # adding to the index. Behaviour kept as-is -- confirm intent.
                if operation_method == "new":
                    index.add(gallery_features)
            else:
                # only train when new index file
                if operation_method == "new":
                    index.train(gallery_features)
                index.add_with_ids(gallery_features, ids_now)

            for i, d in zip(list(ids_now), gallery_docs):
                ids[i] = d
        else:
            if configured_method == "HNSW32":
                raise RuntimeError(
                    "The index_method: HNSW32 dose not support 'remove' operation"
                )
            # remove ids in id_map, remove index data in faiss index
            # (explicit int64 so an empty selection is not a float64 array)
            remove_ids = np.asarray(
                [k for k in ids if ids[k] in gallery_docs], dtype=np.int64)
            index.remove_ids(remove_ids)
            for k in remove_ids:
                del ids[k]

        # store faiss index file and id_map file
        if is_binary:
            faiss.write_index_binary(index, index_path)
        else:
            faiss.write_index(index, index_path)

        with open(id_map_path, 'wb') as fd:
            pickle.dump(ids, fd)
示例#12
0
 def test_factory_MultiHash(self):
     """The "BHash5x6" spec at d=256 gives an index with b == 6 and nhash == 5."""
     multi_hash = faiss.index_binary_factory(256, "BHash5x6")
     assert multi_hash.b == 6
     assert multi_hash.nhash == 5
示例#13
0
 def test_factory_Hash(self):
     """The "BHash12" spec at d=256 gives an index with b == 12."""
     hash_index = faiss.index_binary_factory(256, "BHash12")
     assert hash_index.b == 12