def _add():
    milvus = Milvus(**server_config)

    vectors = _generate_vectors(128, 10000)
    print('\n\tPID: {}, insert {} vectors'.format(os.getpid(), 10000))
    status, _ = milvus.insert(_collection_name, vectors)
    if not status.OK():
        print("PID {} insert failed: {}".format(os.getpid(), status.message))
    milvus.close()

def validate_insert(_collection_name):
    milvus = Milvus(**server_config)
    milvus.flush([_collection_name])
    status, count = milvus.count_entities(_collection_name)
    assert count == 10 * 10000, "Insert validation failed: vector count does not match."

    # drop collection
    print("Drop collection ...")
    milvus.drop_collection(_collection_name)
    milvus.close()
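# Illustrative driver (not part of the original example): `_add` and
# `validate_insert` above appear to come from a multiprocessing insert test, so a
# minimal sketch that makes the `10 * 10000` assertion hold could look like this,
# assuming `server_config`, `_generate_vectors` and `_collection_name` are defined
# at module level as in the original test.
from multiprocessing import Process

def run_insert_test(_collection_name):
    processes = [Process(target=_add) for _ in range(10)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    validate_insert(_collection_name)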
Example No. 3
def create_collection(collection_name):
    client = Milvus(host, str(port))

    status, ok = client.has_collection(collection_name)
    if not ok:
        param = {
            'collection_name': collection_name,
            'dimension': 3,
        }
        client.create_collection(param)
    client.close()
    def _create_collection(_collection_param):
        milvus = Milvus(**server_config)
        _collection_name = _collection_param['collection_name']
        status, ok = milvus.has_collection(_collection_name)
        if ok:
            print("Collection {} found, now going to delete it".format(
                _collection_name))
            status = milvus.drop_collection(_collection_name)
            if not status.OK():
                raise Exception("Delete collection error")
            print(
                "delete collection {} successfully!".format(_collection_name))
        time.sleep(5)

        status, ok = milvus.has_collection(_collection_name)
        if ok:
            raise Exception("Delete collection error")

        status = milvus.create_collection(_collection_param)
        if not status.OK():
            print("Create collection {} failed".format(_collection_name))
        milvus.close()
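# Illustrative only (not part of the original example): a plausible value for the
# `_collection_param` consumed by `_create_collection` above, using the standard
# pymilvus 1.x collection fields; the concrete numbers are placeholders and
# `MetricType` is assumed to be imported from `milvus`.
example_collection_param = {
    'collection_name': 'demo_collection',
    'dimension': 128,
    'index_file_size': 1024,
    'metric_type': MetricType.L2,
}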
Example No. 5
    # create `IVF_PQ` index
    status = client.create_index(collection_name, IndexType.IVF_PQ,
                                 index_param)
    if status.OK():
        print("Create index IVF_PQ successfully\n")
    else:
        print("Create index fail: ", status)

    # use the first 10 inserted vectors as query vectors
    query_vectors = vectors[:10]

    # specify search param
    search_param = {"nprobe": 10}

    # specify top_k as 2 to search for the 2 approximate nearest neighbors
    status, result = client.search(collection_name,
                                   2,
                                   query_vectors,
                                   params=search_param)

    if status.OK():
        # show search result
        print("Search successfully. Result:\n", result)
    else:
        print("Search fail")

    # drop collection
    client.drop_collection(collection_name)

    client.close()
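# Illustrative only (not part of the original example): the snippet above assumes
# `index_param` and `vectors` were defined earlier. For an IVF_PQ index in
# pymilvus 1.x the parameter dict takes `nlist` plus the product-quantization
# factor `m`, and `m` must evenly divide the vector dimension. One plausible
# choice for 128-dimensional vectors:
example_ivf_pq_param = {'nlist': 2048, 'm': 16}  # hypothetical values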
Example No. 6
param = {
    'collection_name': col_name,
    'dimension': dim,
}
milvus.create_collection(param=param)

ivf_param = {'nlist': 16384}
milvus.create_index(collection_name=col_name,
                    index_type=IndexType.IVF_FLAT,
                    params=ivf_param)

vectors = [[random.random() for _ in range(dim)] for _ in range(2000)]
vector_ids = list(range(2000))
_, ids = milvus.insert(collection_name=col_name,
                       records=vectors,
                       ids=vector_ids)
# print(ids)

time.sleep(1)
search_param = {'nprobe': 16}
q_records = [[random.random() for _ in range(dim)] for _ in range(5)]
_, result = milvus.search(collection_name=col_name,
                          query_records=q_records,
                          top_k=2,
                          params=search_param)
# for r in result:
#     print(r)
print(result.id_array)
print(result)

milvus.drop_collection(collection_name=col_name)

milvus.close()
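# Note (illustrative, not from the original example): the `time.sleep(1)` above
# relies on Milvus 1.x flushing newly inserted data on its periodic auto-flush
# interval before the search. An explicit flush, as other snippets on this page
# use, is the more deterministic alternative:
#     milvus.flush([col_name])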
Example No. 7
class MilvusDocumentStore(SQLDocumentStore):
    """
    Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized in storing and processing vectors.
    Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR).
    In contrast to FAISS, Milvus ...
     - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment
     - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index)
     - encapsulates multiple ANN libraries (FAISS, ANNOY ...)

    This class uses Milvus for all vector related storage, processing and querying.
    The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus
    does not allow these data types (yet).

    Usage:
    1. Start a Milvus server (see https://milvus.io/docs/v1.0.0/install_milvus.md)
    2. Init a MilvusDocumentStore in Haystack
    """
    def __init__(
        self,
        sql_url: str = "sqlite:///",
        milvus_url: str = "tcp://localhost:19530",
        connection_pool: str = "SingletonThread",
        index: str = "document",
        vector_dim: int = 768,
        index_file_size: int = 1024,
        similarity: str = "dot_product",
        index_type: IndexType = IndexType.FLAT,
        index_param: Optional[Dict[str, Any]] = None,
        search_param: Optional[Dict[str, Any]] = None,
        update_existing_documents: bool = False,
        return_embedding: bool = False,
        embedding_field: str = "embedding",
        progress_bar: bool = True,
        **kwargs,
    ):
        """
        :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
                        deployment, Postgres is recommended. If using MySQL, the same server can also be used for
                        Milvus metadata. For more details see https://milvus.io/docs/v1.0.0/data_manage.md.
        :param milvus_url: Milvus server connection URL for storing and processing vectors.
                           Protocol, host and port will automatically be inferred from the URL.
                           See https://milvus.io/docs/v1.0.0/install_milvus.md for instructions to start a Milvus instance.
        :param connection_pool: Connection pool type to connect with Milvus server. Default: "SingletonThread".
        :param index: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
        :param vector_dim: The embedding vector size. Default: 768.
        :param index_file_size: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
         When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
         Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
         As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
         Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
         (From https://milvus.io/docs/v1.0.0/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size)
        :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
                           'cosine' is recommended for Sentence Transformers, but is not directly supported by Milvus.
                           However, you can normalize your embeddings and use `dot_product` to get the same results.
                           See https://milvus.io/docs/v1.0.0/metric.md?Inner-product-(IP)#floating.
        :param index_type: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
                           Some popular options:
                           - FLAT (default): Exact method, slow
                           - IVF_FLAT: Inverted file based heuristic, fast
                           - HNSW: Graph based, fast
                           - ANNOY: Tree based, fast
                           See: https://milvus.io/docs/v1.0.0/index.md
        :param index_param: Configuration parameters for the chosen index_type needed at indexing time.
                            For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
                            See https://milvus.io/docs/v1.0.0/index.md
        :param search_param: Configuration parameters for the chosen index_type needed at query time.
                             For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT.
                             See https://milvus.io/docs/v1.0.0/index.md
        :param update_existing_documents: Whether to update any existing documents with the same ID when adding
                                          documents. When set as True, any document with an existing ID gets updated.
                                          If set to False, an error is raised if the document ID of the document being
                                          added already exists.
        :param return_embedding: Whether to return the document embedding.
        :param embedding_field: Name of field containing an embedding vector.
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        """
        self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool)
        self.vector_dim = vector_dim
        self.index_file_size = index_file_size

        if similarity == "dot_product":
            self.metric_type = MetricType.IP
            self.similarity = similarity
        else:
            raise ValueError(
                "The Milvus document store can currently only support dot_product similarity. "
                "Please set similarity=\"dot_product\"")

        self.index_type = index_type
        self.index_param = index_param or {"nlist": 16384}
        self.search_param = search_param or {"nprobe": 10}
        self.index = index
        self._create_collection_and_index_if_not_exist(self.index)
        self.return_embedding = return_embedding
        self.embedding_field = embedding_field
        self.progress_bar = progress_bar

        super().__init__(url=sql_url,
                         update_existing_documents=update_existing_documents,
                         index=index)

    def __del__(self):
        return self.milvus_server.close()

    def _create_collection_and_index_if_not_exist(
            self,
            index: Optional[str] = None,
            index_param: Optional[Dict[str, Any]] = None):
        index = index or self.index
        index_param = index_param or self.index_param

        status, ok = self.milvus_server.has_collection(collection_name=index)
        if not ok:
            collection_param = {
                'collection_name': index,
                'dimension': self.vector_dim,
                'index_file_size': self.index_file_size,
                'metric_type': self.metric_type
            }

            status = self.milvus_server.create_collection(collection_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Collection creation on Milvus server failed: {status}')

            status = self.milvus_server.create_index(index, self.index_type,
                                                     index_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Index creation on Milvus server failed: {status}')

    def _create_document_field_map(self) -> Dict:
        return {
            self.index: self.embedding_field,
        }

    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None,
                        batch_size: int = 10_000):
        """
        Add new documents to the DocumentStore.

        :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
                                  them right away in Milvus. If not, you can later call update_embeddings() to create & index them.
        :param index: (SQL) index name for storing the docs and metadata
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return:
        """
        index = index or self.index
        self._create_collection_and_index_if_not_exist(index)
        field_map = self._create_document_field_map()

        if len(documents) == 0:
            logger.warning(
                "Calling DocumentStore.write_documents() with empty list")
            return

        document_objects = [
            Document.from_dict(d, field_map=field_map)
            if isinstance(d, dict) else d for d in documents
        ]

        add_vectors = False if document_objects[0].embedding is None else True

        batched_documents = get_batches_from_generator(document_objects,
                                                       batch_size)
        with tqdm(total=len(document_objects),
                  disable=not self.progress_bar) as progress_bar:
            for document_batch in batched_documents:
                vector_ids = []
                if add_vectors:
                    doc_ids = []
                    embeddings = []
                    for doc in document_batch:
                        doc_ids.append(doc.id)
                        if isinstance(doc.embedding, np.ndarray):
                            embeddings.append(doc.embedding.tolist())
                        elif isinstance(doc.embedding, list):
                            embeddings.append(doc.embedding)
                        else:
                            raise AttributeError(
                                f'Format of supplied document embedding {type(doc.embedding)} is not '
                                f'supported. Please use list or numpy.ndarray')

                    if self.update_existing_documents:
                        existing_docs = super().get_documents_by_id(
                            ids=doc_ids, index=index)
                        self._delete_vector_ids_from_milvus(
                            documents=existing_docs, index=index)

                    status, vector_ids = self.milvus_server.insert(
                        collection_name=index, records=embeddings)
                    if status.code != Status.SUCCESS:
                        raise RuntimeError(
                            f'Vector embedding insertion failed: {status}')

                docs_to_write_in_sql = []
                for idx, doc in enumerate(document_batch):
                    meta = doc.meta
                    if add_vectors:
                        meta["vector_id"] = vector_ids[idx]
                    docs_to_write_in_sql.append(doc)

                super().write_documents(docs_to_write_in_sql, index=index)
                progress_bar.update(batch_size)
        progress_bar.close()

        self.milvus_server.flush([index])
        if self.update_existing_documents:
            self.milvus_server.compact(collection_name=index)

    def update_embeddings(
        self,
        retriever: BaseRetriever,
        index: Optional[str] = None,
        batch_size: int = 10_000,
        update_existing_embeddings: bool = True,
        filters: Optional[Dict[str, List[str]]] = None,
    ):
Example No. 8
class MilvusDBHandler:
    """Milvus DB handler
        This class is intended to abstract the access and communication with external MilvusDB from Executors

        For more information about Milvus:
            - https://github.com/milvus-io/milvus/
    """

    @staticmethod
    def get_index_type(index_type):
        from milvus import IndexType

        return {
            'Flat': IndexType.FLAT,
            'IVF,Flat': IndexType.IVFLAT,
            'IVF,SQ8': IndexType.IVF_SQ8,
            'RNSG': IndexType.RNSG,
            'IVF,SQ8H': IndexType.IVF_SQ8H,
            'IVF,PQ': IndexType.IVF_PQ,
            'HNSW': IndexType.HNSW,
            'Annoy': IndexType.ANNOY
        }.get(index_type, IndexType.FLAT)

    class MilvusDBInserter:
        """Milvus DB Inserter
            This class is an inner class and provides a context manager to insert vectors into Milvus while ensuring
            data is flushed.

            For more information about Milvus:
                - https://github.com/milvus-io/milvus/
        """

        def __init__(self, client, collection_name: str):
            self.logger = get_logger(self.__class__.__name__)
            self.client = client
            self.collection_name = collection_name

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.logger.info(f'Sending flush command to Milvus Server for collection: {self.collection_name}')
            self.client.flush([self.collection_name])

        def insert(self, keys: list, vectors: 'np.ndarray'):
            status, _ = self.client.insert(collection_name=self.collection_name, records=vectors, ids=keys)
            if not status.OK():
                self.logger.error('Insert failed: {}'.format(status))
                raise MilvusDBException(status.message)

    def __init__(self, host: str, port: int, collection_name: str):
        """
        Initialize a MilvusDBHandler

        :param host: Host of the Milvus Server
        :param port: Port to connect to the Milvus Server
        :param collection_name: Name of the collection where the Handler will insert and query vectors.
        """
        self.logger = get_logger(self.__class__.__name__)
        self.host = host
        self.port = str(port)
        self.collection_name = collection_name
        self.milvus_client = None

    def __enter__(self):
        return self.connect()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def connect(self):
        from milvus import Milvus
        if self.milvus_client is None or not self.milvus_client.server_status()[0].OK():
            self.logger.info(f'Setting connection to Milvus Server at {self.host}:{self.port}')
            self.milvus_client = Milvus(self.host, self.port)
        return self

    def close(self):
        self.logger.info(f'Closing connection to Milvus Server at {self.host}:{self.port}')
        self.milvus_client.close()

    def insert(self, keys: 'np.ndarray', vectors: 'np.ndarray'):
        with MilvusDBHandler.MilvusDBInserter(self.milvus_client, self.collection_name) as db:
            db.insert(reduce(operator.concat, keys.tolist()), vectors)

    def build_index(self, index_type: str, index_params: dict):
        milvus_index_type = self.get_index_type(index_type)

        self.logger.info(f'Creating index of type: {index_type} at'
                         f' Milvus Server. collection: {self.collection_name} with index params: {index_params}')
        status = self.milvus_client.create_index(self.collection_name, milvus_index_type, index_params)
        if not status.OK():
            self.logger.error('Creating index failed: {}'.format(status))
            raise MilvusDBException(status.message)

    def search(self, query_vectors: 'np.ndarray', top_k: int, search_params: dict = None):
        self.logger.info(f'Querying collection: {self.collection_name} with search params: {search_params}')
        status, results = self.milvus_client.search(collection_name=self.collection_name,
                                                    query_records=query_vectors, top_k=top_k, params=search_params)
        if not status.OK():
            self.logger.error('Querying index failed: {}'.format(status))
            raise MilvusDBException(status.message)
        else:
            return results.distance_array, results.id_array
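# Illustrative usage of MilvusDBHandler (not part of the original class); host,
# port and collection name are placeholders, the collection is assumed to already
# exist on a Milvus 1.x server, and the 128-dimensional vectors are random.
import numpy as np

with MilvusDBHandler('localhost', 19530, 'my_collection') as handler:
    keys = np.array([[1], [2], [3]])                    # insert() flattens these
    vectors = np.random.rand(3, 128).astype('float32')  # placeholder vectors
    handler.insert(keys, vectors)
    handler.build_index('IVF,Flat', {'nlist': 128})
    distances, ids = handler.search(np.random.rand(1, 128).astype('float32'),
                                    top_k=5, search_params={'nprobe': 8})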
Example No. 9
class MilvusHelper(BaseVectorSimilarityHelper):
    def __init__(self, _server_url, _server_port, _timeout=10):
        super().__init__()
        self.server_url = _server_url
        self.server_port = _server_port
        self.timeout = _timeout
        self.client = None
        self.metric_type_mapper = {
            VectorMetricType.L2: MetricType.L2,
            VectorMetricType.IP: MetricType.IP,
            VectorMetricType.JACCARD: MetricType.JACCARD,
            VectorMetricType.HAMMING: MetricType.HAMMING,
        }

        self.index_type_mapper = {
            VectorIndexType.FLAT: IndexType.FLAT,
            VectorIndexType.IVFLAT: IndexType.IVFLAT,
            VectorIndexType.IVF_SQ8: IndexType.IVF_SQ8,
            VectorIndexType.RNSG: IndexType.RNSG,
            VectorIndexType.IVF_SQ8H: IndexType.IVF_SQ8H,
            VectorIndexType.IVF_PQ: IndexType.IVF_PQ,
            VectorIndexType.HNSW: IndexType.HNSW,
            VectorIndexType.ANNOY: IndexType.ANNOY,
        }

    def init(self):
        if self.client is None:
            if not (self.server_url is None or self.server_port is None):
                try:
                    self.client = Milvus(host=self.server_url, port=self.server_port)
                except Exception:
                    raise MilvusRuntimeException(f'cannot connect to {self.server_url}:{self.server_port}')
            else:
                raise MilvusRuntimeException('Milvus config is not correct')

    def insert(self, _database_name, _to_insert_vector, _partition_tag=None, _params=None):
        """
        向数据库中插入一系列的特征向量

        notes:如果用户有自己的id,建议使用insert_with_id函数

        ATTENTION!!!

        一个库中不能既调用insert_with_id还调用insert,只能调用一种,否则会报错

        Args:
            _database_name:     数据库名称
            _to_insert_vector:  待插入的特征向量的列表
            _partition_tag:     分区标签
            _params:    插入参数

        Returns:    插入后的id

        """
        self.init()
        status, ids = self.client.insert(_database_name, _to_insert_vector, partition_tag=_partition_tag,
                                         params=_params,
                                         timeout=self.timeout)
        self.flush(_database_name)
        if status.OK():
            return ids
        else:
            raise MilvusRuntimeException(status.message)

    def insert_with_id(self, _database_name, _to_insert_vector, _to_insert_ids, _partition_tag=None, _params=None):
        """
        向数据库中插入一系列的有固定id的特征向量

        ATTENTION!!!

        一个库中不能既调用insert_with_id还调用insert,只能调用一种,否则会报错

        Args:
            _database_name:     数据库名称
            _to_insert_vector:  待插入的特征向量的列表
            _to_insert_ids:  待插入的特征向量的id的列表,每个元素必须为正整数,且不越界
            _partition_tag:     分区标签
            _params:    插入参数

        Returns:    插入后的id

        """
        self.init()
        status, ids = self.client.insert(_database_name, _to_insert_vector,
                                         ids=_to_insert_ids, partition_tag=_partition_tag,
                                         params=_params,
                                         timeout=self.timeout)
        self.flush(_database_name)
        if status.OK():
            return ids
        else:
            raise MilvusRuntimeException(status.message)

    def delete(self, _database_name, _to_delete_ids):
        """
        删除特定id

        Args:
            _database_name:     数据库名称
            _to_delete_ids:     待删除的id

        Returns:    是否删除成功

        """
        self.init()
        status = self.client.delete_entity_by_id(_database_name, _to_delete_ids, self.timeout)
        self.flush(_database_name)
        if status.OK():
            return True
        else:
            raise MilvusRuntimeException(status.message)

    def database_exist(self, _database_name):
        """
        数据库是否存在

        Args:
            _database_name:     数据库名称

        Returns:    是否存在

        """
        self.init()
        status, is_exist = self.client.has_collection(_database_name, self.timeout)
        if status.OK():
            return is_exist
        else:
            raise MilvusRuntimeException(status.message)

    def create_database(self, _database_name, _dimension, _index_file_size, _metric_type):
        """
        创建数据库

        Args:
            _database_name:     数据库名称
            _dimension:     特征向量维度
            _index_file_size:   index的文件大小
            _metric_type:   度量类型

        Returns:    是否创建成功

        """
        self.init()
        if not self.database_exist(_database_name):
            assert _metric_type in self.metric_type_mapper, f'{_metric_type} not supported in milvus'
            status = self.client.create_collection({
                'collection_name': _database_name,
                'dimension': _dimension,
                'index_file_size': _index_file_size,
                'metric_type': self.metric_type_mapper[_metric_type]
            })
            if status.OK():
                return True
            else:
                raise MilvusRuntimeException(status.message)
        else:
            return True

    def create_index(self, _database_name, _index_type):
        """
        创建index(索引)

        Args:
            _database_name:     数据库名称
            _index_type:    index类型

        Returns:    是否创建成功

        """
        self.init()
        if self.database_exist(_database_name):
            assert _index_type in self.index_type_mapper, f'{_index_type} not supported in milvus'
            status = self.client.create_index(_database_name, self.index_type_mapper[_index_type], timeout=self.timeout)
            if status.OK():
                return True
            else:
                raise MilvusRuntimeException(status.message)

    def search(self, _database_name, _query_vector_list, _top_k, _partition_tag=None, _params=None):
        """
        检索用参数

        Args:
            _database_name:     数据库名称
            _query_vector_list:      检索用的特征向量列表
            _top_k:     top k
            _partition_tag:     分区标签
            _params:    检索参数

        Returns:    检索的结果,包含id和distance

        """
        self.init()
        if self.database_exist(_database_name):
            status, search_result = self.client.search(_database_name, _top_k, _query_vector_list,
                                                       partition_tags=_partition_tag,
                                                       params=_params,
                                                       timeout=self.timeout)
            if status.OK():
                return search_result
            else:
                raise MilvusRuntimeException(status.message)
        else:
            raise DatabaseNotExist(f'{_database_name} not exist')

    def flush(self, _database_name):
        """
        sink数据库

        Args:
            _database_name:     数据库名称

        Returns:    是否flush成功

        """
        self.init()
        status = self.client.flush([_database_name, ], self.timeout)
        if status.OK():
            return True
        else:
            raise MilvusRuntimeException(status.message)

    def __del__(self):
        if self.client is not None:
            self.client.close()
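# Illustrative end-to-end usage of MilvusHelper (not part of the original class);
# the address, database name, dimension and index choice are placeholders, and
# `VectorMetricType` / `VectorIndexType` are assumed to be imported from the
# surrounding project.
import random

helper = MilvusHelper('127.0.0.1', 19530)
helper.create_database('demo_db', _dimension=128, _index_file_size=1024,
                       _metric_type=VectorMetricType.L2)
helper.create_index('demo_db', VectorIndexType.IVF_SQ8)
demo_vectors = [[random.random() for _ in range(128)] for _ in range(100)]
inserted_ids = helper.insert('demo_db', demo_vectors)
hits = helper.search('demo_db', demo_vectors[:2], _top_k=5)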
Example No. 10
class MilvusDocumentStore(SQLDocumentStore):
    """
    Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized in storing and processing vectors.
    Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR).
    In contrast to FAISS, Milvus ...
     - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment
     - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index)
     - encapsulates multiple ANN libraries (FAISS, ANNOY ...)

    This class uses Milvus for all vector related storage, processing and querying.
    The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus
    does not allow these data types (yet).

    Usage:
    1. Start a Milvus server (see https://milvus.io/docs/v1.0.0/install_milvus.md)
    2. Init a MilvusDocumentStore in Haystack
    """
    def __init__(
        self,
        sql_url: str = "sqlite:///",
        milvus_url: str = "tcp://localhost:19530",
        connection_pool: str = "SingletonThread",
        index: str = "document",
        vector_dim: int = 768,
        index_file_size: int = 1024,
        similarity: str = "dot_product",
        index_type: IndexType = IndexType.FLAT,
        index_param: Optional[Dict[str, Any]] = None,
        search_param: Optional[Dict[str, Any]] = None,
        return_embedding: bool = False,
        embedding_field: str = "embedding",
        progress_bar: bool = True,
        duplicate_documents: str = 'overwrite',
        **kwargs,
    ):
        """
        :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
                        deployment, Postgres is recommended. If using MySQL, the same server can also be used for
                        Milvus metadata. For more details see https://milvus.io/docs/v1.0.0/data_manage.md.
        :param milvus_url: Milvus server connection URL for storing and processing vectors.
                           Protocol, host and port will automatically be inferred from the URL.
                           See https://milvus.io/docs/v1.0.0/install_milvus.md for instructions to start a Milvus instance.
        :param connection_pool: Connection pool type to connect with Milvus server. Default: "SingletonThread".
        :param index: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
        :param vector_dim: The embedding vector size. Default: 768.
        :param index_file_size: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
         When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
         Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
         As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
         Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
         (From https://milvus.io/docs/v1.0.0/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size)
        :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
                           'cosine' is recommended for Sentence Transformers, but is not directly supported by Milvus.
                           However, you can normalize your embeddings and use `dot_product` to get the same results.
                           See https://milvus.io/docs/v1.0.0/metric.md?Inner-product-(IP)#floating.
        :param index_type: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
                           Some popular options:
                           - FLAT (default): Exact method, slow
                           - IVF_FLAT: Inverted file based heuristic, fast
                           - HNSW: Graph based, fast
                           - ANNOY: Tree based, fast
                           See: https://milvus.io/docs/v1.0.0/index.md
        :param index_param: Configuration parameters for the chosen index_type needed at indexing time.
                            For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
                            See https://milvus.io/docs/v1.0.0/index.md
        :param search_param: Configuration parameters for the chosen index_type needed at query time.
                             For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT.
                             See https://milvus.io/docs/v1.0.0/index.md
        :param return_embedding: Whether to return the document embedding.
        :param embedding_field: Name of field containing an embedding vector.
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        :param duplicate_documents: Handle duplicate documents based on parameter options.
                                    Parameter options: ('skip', 'overwrite', 'fail')
                                    skip: Ignore duplicate documents.
                                    overwrite: Update any existing documents with the same ID when adding documents.
                                    fail: An error is raised if the document ID of the document being added already
                                    exists.
        """

        # save init parameters to enable export of component config as YAML
        self.set_config(
            sql_url=sql_url,
            milvus_url=milvus_url,
            connection_pool=connection_pool,
            index=index,
            vector_dim=vector_dim,
            index_file_size=index_file_size,
            similarity=similarity,
            index_type=index_type,
            index_param=index_param,
            search_param=search_param,
            duplicate_documents=duplicate_documents,
            return_embedding=return_embedding,
            embedding_field=embedding_field,
            progress_bar=progress_bar,
        )

        self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool)
        self.vector_dim = vector_dim
        self.index_file_size = index_file_size

        if similarity == "dot_product":
            self.metric_type = MetricType.IP
            self.similarity = similarity
        elif similarity == "l2":
            self.metric_type = MetricType.L2
            self.similarity = similarity
        else:
            raise ValueError(
                "The Milvus document store can currently only support dot_product and L2 similarity. "
                "Please set similarity=\"dot_product\" or \"l2\"")

        self.index_type = index_type
        self.index_param = index_param or {"nlist": 16384}
        self.search_param = search_param or {"nprobe": 10}
        self.index = index
        self._create_collection_and_index_if_not_exist(self.index)
        self.return_embedding = return_embedding
        self.embedding_field = embedding_field
        self.progress_bar = progress_bar
        self.duplicate_documents = duplicate_documents

        super().__init__(url=sql_url, index=index)

    def __del__(self):
        return self.milvus_server.close()

    def _create_collection_and_index_if_not_exist(
            self,
            index: Optional[str] = None,
            index_param: Optional[Dict[str, Any]] = None):
        index = index or self.index
        index_param = index_param or self.index_param

        status, ok = self.milvus_server.has_collection(collection_name=index)
        if not ok:
            collection_param = {
                'collection_name': index,
                'dimension': self.vector_dim,
                'index_file_size': self.index_file_size,
                'metric_type': self.metric_type
            }

            status = self.milvus_server.create_collection(collection_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Collection creation on Milvus server failed: {status}')

            status = self.milvus_server.create_index(index, self.index_type,
                                                     index_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Index creation on Milvus server failed: {status}')

    def _create_document_field_map(self) -> Dict:
        return {
            self.index: self.embedding_field,
        }

    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None,
                        batch_size: int = 10_000,
                        duplicate_documents: Optional[str] = None,
                        index_param: Optional[Dict[str, Any]] = None):
Example No. 11
class Indexer:
    '''
    Indexer.
    '''
    def __init__(self, name, host='127.0.0.1', port='19531'):
        '''
        Initialize.
        '''
        self.client = Milvus(host=host, port=port)
        self.collection = name

    def init(self, lenient=False):
        '''
        Create the collection.
        '''
        if lenient:
            status, result = self.client.has_collection(
                collection_name=self.collection)
            if status.code != 0:
                raise ExertMilvusException(status)
            if result:
                return

        status = self.client.create_collection({
            'collection_name': self.collection,
            'dimension': 512,
            'index_file_size': 1024,
            'metric_type': MetricType.L2
        })
        if status.code != 0 and not (lenient and status.code == 9):
            raise ExertMilvusException(status)

        # Create the index.
        status = self.client.create_index(collection_name=self.collection,
                                          index_type=IndexType.IVF_FLAT,
                                          params={'nlist': 16384})
        if status.code != 0:
            raise ExertMilvusException(status)

        return status

    def drop(self):
        '''
        Drop the collection.
        '''
        status = self.client.drop_collection(collection_name=self.collection)
        if status.code != 0:
            raise ExertMilvusException(status)

    def flush(self):
        '''
        Flush to disk.
        '''
        status = self.client.flush([self.collection])
        if status.code != 0:
            raise ExertMilvusException(status)

    def compact(self):
        '''
        Compact the collection.
        '''
        status = self.client.compact(collection_name=self.collection)
        if status.code != 0:
            raise ExertMilvusException(status)

    def close(self):
        '''
        Close the connection.
        '''
        self.client.close()

    def new_tag(self, tag):
        '''
        Create a partition tag.
        '''
        status = self.client.create_partition(collection_name=self.collection,
                                              partition_tag=tag)
        if status.code != 0:
            raise ExertMilvusException(status)

    def list_tag(self):
        '''
        List partition tags.
        '''
        status, result = self.client.list_partitions(
            collection_name=self.collection)
        if status.code != 0:
            raise ExertMilvusException(status)
        return result

    def drop_tag(self, tag):
        '''
        Drop a partition tag.
        '''
        status = self.client.drop_partition(collection_name=self.collection,
                                            partition_tag=tag)
        if status.code != 0:
            raise ExertMilvusException(status)

    def index(self, vectors, tag=None, ids=None):
        '''
        Add vectors to the index.
        '''
        params = {}
        if tag is not None:
            params['partition_tag'] = tag
        if ids is not None:
            params['ids'] = ids
        status, result = self.client.insert(collection_name=self.collection,
                                            records=vectors,
                                            **params)
        if status.code != 0:
            raise ExertMilvusException(status)

        return result

    def listing(self, ids):
        '''
        Fetch entities by id.
        '''
        status, result = self.client.get_entity_by_id(
            collection_name=self.collection, ids=ids)
        if status.code != 0:
            raise ExertMilvusException(status)
        return result

    def counting(self):
        '''
        Count the entities in the collection.
        '''
        status, result = self.client.count_entities(
            collection_name=self.collection)
        if status.code != 0:
            raise ExertMilvusException(status)
        return result

    def unindex(self, ids):
        '''
        Remove entities from the index by id.
        '''
        status = self.client.delete_entity_by_id(
            collection_name=self.collection, id_array=ids)
        if status.code != 0:
            raise ExertMilvusException(status)

    def search(self, vectors, top_count=100, tags=None):
        '''
        Search.
        '''
        params = {'params': {'nprobe': 16}}
        if tags is not None:
            params['partition_tags'] = tags
        status, results = self.client.search(collection_name=self.collection,
                                             query_records=vectors,
                                             top_k=top_count,
                                             **params)
        if status.code != 0:
            raise ExertMilvusException(status)
        return results
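# Illustrative usage of Indexer (not part of the original class); the collection
# name, tag and host/port are placeholders, and the vectors are 512-dimensional to
# match the dimension hard-coded in init().
import random

indexer = Indexer('demo', host='127.0.0.1', port='19531')
indexer.init(lenient=True)                 # create the collection if it is missing
indexer.new_tag('batch-1')                 # optional partition tag
demo_vectors = [[random.random() for _ in range(512)] for _ in range(10)]
new_ids = indexer.index(demo_vectors, tag='batch-1')
indexer.flush()
results = indexer.search(demo_vectors[:1], top_count=3, tags=['batch-1'])
indexer.close()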
Example No. 12
def main():
    milvus = Milvus(uri=uri)
    param = {
        'collection_name': collection_name,
        'dimension': _DIM,
        'index_file_size': 32,
        #'metric_type': MetricType.IP
        'metric_type': MetricType.L2
    }
    # show collections in Milvus server
    _, collections = milvus.list_collections()

    # create the collection
    milvus.create_collection(param)
    # create a collection partition
    milvus.create_partition(collection_name, partition_tag)

    print(f'collections in Milvus: {collections}')
    # Describe demo_collection
    _, collection = milvus.get_collection_info(collection_name)
    print(f'describe demo_collection: {collection}')

    # build fake vectors
    vectors = [[random.random() for _ in range(_DIM)] for _ in range(10)]
    vectors1 = [[random.random() for _ in range(_DIM)] for _ in range(10)]

    status, id = milvus.insert(collection_name=collection_name,
                               records=vectors,
                               ids=list(range(10)),
                               partition_tag=partition_tag)
    print(f'status: {status} | id: {id}')
    if not status.OK():
        print(f"insert failded: {status}")

    status1, id1 = milvus.insert(collection_name=collection_name,
                                 records=vectors1,
                                 ids=list(range(10, 20)),
                                 partition_tag=partition_tag)
    print(f'status1: {status1} | id1: {id1}')

    ids_deleted = list(range(10))

    status_delete = milvus.delete_entity_by_id(collection_name=collection_name,
                                               id_array=ids_deleted)
    if status_delete.OK():
        print(f'delete successful')

    # Flush the collection's inserted data to disk
    milvus.flush([collection_name])
    # Get demo_collection row count
    status, result = milvus.count_entities(collection_name)
    print(f"demo_collection row count: {result}")

    # Obtain raw vectors by providing vector ids
    status, result_vectors = milvus.get_entity_by_id(collection_name,
                                                     list(range(10, 20)))

    # create an index on the vectors to search more rapidly
    index_param = {'nlist': 2}

    # create ivflat index in demo_collection
    status = milvus.create_index(collection_name, IndexType.IVF_FLAT,
                                 index_param)
    if status.OK():
        print(f"create index ivf_flat succeeed")

    # use the first 2 vectors from vectors1 for similarity search
    query_vectors = vectors1[0:2]

    # execute vector similarity search
    search_param = {"nprobe": 16}

    param = {
        'collection_name': collection_name,
        'query_records': query_vectors,
        'top_k': 1,
        'params': search_param
    }

    status, results = milvus.search(**param)
    if status.OK():
        if results[0][0].distance == 0.0:
            print('query result is correct')
        else:
            print('not correct')
        print(results)
    else:
        print(f'search failed: {status}')

    # drop the existing collection
    milvus.drop_collection(collection_name=collection_name)

    milvus.close()
Example No. 13
from milvus import Milvus, DataType
from pprint import pprint

# host = '14.241.120.239'
# port = '11037'

host = "192.168.111.133"
port = "11037"

milvus_client = Milvus(host, port, name="facial_recognition2")
milvus_client.close()
milvus_client = Milvus(host, port, name="facial_recognition2")

collection_bodies = 'bodies'
collection_faces = 'faces'
partition_identities = 'identities'
partition_objects = 'objects'


def define_collection_param_faces():
    collection_param_faces = {
        "fields": [
            {
                "name": "head_pose_range",
                "type": DataType.INT32
            },
            {
                "name": "facial_vector",
                "type": DataType.FLOAT_VECTOR,
                "params": {
                    "dim": 512
Example No. 14
class MilvusDocumentStore(SQLDocumentStore):
    """
    Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized in storing and processing vectors.
    Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR).
    In contrast to FAISS, Milvus ...
     - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment
     - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index)
     - encapsulates multiple ANN libraries (FAISS, ANNOY ...)

    This class uses Milvus for all vector related storage, processing and querying.
    The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus
    does not allow these data types (yet).

    Usage:
    1. Start a Milvus server (see https://milvus.io/docs/v0.10.5/install_milvus.md)
    2. Init a MilvusDocumentStore in Haystack
    """
    def __init__(
        self,
        sql_url: str = "sqlite:///",
        milvus_url: str = "tcp://localhost:19530",
        connection_pool: str = "SingletonThread",
        index: str = "document",
        vector_dim: int = 768,
        index_file_size: int = 1024,
        similarity: str = "dot_product",
        index_type: IndexType = IndexType.FLAT,
        index_param: Optional[Dict[str, Any]] = None,
        search_param: Optional[Dict[str, Any]] = None,
        update_existing_documents: bool = False,
        return_embedding: bool = False,
        embedding_field: str = "embedding",
        **kwargs,
    ):
        """
        :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
                        deployment, Postgres is recommended. If using MySQL, the same server can also be used for
                        Milvus metadata. For more details see https://milvus.io/docs/v0.10.5/data_manage.md.
        :param milvus_url: Milvus server connection URL for storing and processing vectors.
                           Protocol, host and port will automatically be inferred from the URL.
                           See https://milvus.io/docs/v0.10.5/install_milvus.md for instructions to start a Milvus instance.
        :param connection_pool: Connection pool type to connect with Milvus server. Default: "SingletonThread".
        :param index: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
        :param vector_dim: The embedding vector size. Default: 768.
        :param index_file_size: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
         When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
         Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
         As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
         Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
         (From https://milvus.io/docs/v0.10.5/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size)
        :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
                           'cosine' is recommended for Sentence Transformers, but is not directly supported by Milvus.
                           However, you can normalize your embeddings and use `dot_product` to get the same results.
                           See https://milvus.io/docs/v0.10.5/metric.md?Inner-product-(IP)#floating.
        :param index_type: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
                           Some popular options:
                           - FLAT (default): Exact method, slow
                           - IVF_FLAT: Inverted file based heuristic, fast
                           - HNSW: Graph based, fast
                           - ANNOY: Tree based, fast
                           See: https://milvus.io/docs/v0.10.5/index.md
        :param index_param: Configuration parameters for the chosen index_type needed at indexing time.
                            For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
                            See https://milvus.io/docs/v0.10.5/index.md
        :param search_param: Configuration parameters for the chosen index_type needed at query time.
                             For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT.
                             See https://milvus.io/docs/v0.10.5/index.md
        :param update_existing_documents: Whether to update any existing documents with the same ID when adding
                                          documents. When set as True, any document with an existing ID gets updated.
                                          If set to False, an error is raised if the document ID of the document being
                                          added already exists.
        :param return_embedding: Whether to return the document embedding.
        :param embedding_field: Name of field containing an embedding vector.
        """
        self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool)
        self.vector_dim = vector_dim
        self.index_file_size = index_file_size

        if similarity == "dot_product":
            self.metric_type = MetricType.L2
        else:
            raise ValueError(
                "The Milvus document store can currently only support dot_product similarity. "
                "Please set similarity=\"dot_product\"")

        self.index_type = index_type
        self.index_param = index_param or {"nlist": 16384}
        self.search_param = search_param or {"nprobe": 10}
        self.index = index
        self._create_collection_and_index_if_not_exist(self.index)
        self.return_embedding = return_embedding
        self.embedding_field = embedding_field

        super().__init__(url=sql_url,
                         update_existing_documents=update_existing_documents,
                         index=index)

    def __del__(self):
        return self.milvus_server.close()

    def _create_collection_and_index_if_not_exist(
            self,
            index: Optional[str] = None,
            index_param: Optional[Dict[str, Any]] = None):
        index = index or self.index
        index_param = index_param or self.index_param

        status, ok = self.milvus_server.has_collection(collection_name=index)
        if not ok:
            collection_param = {
                'collection_name': index,
                'dimension': self.vector_dim,
                'index_file_size': self.index_file_size,
                'metric_type': self.metric_type
            }

            status = self.milvus_server.create_collection(collection_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Collection creation on Milvus server failed: {status}')

            status = self.milvus_server.create_index(index, self.index_type,
                                                     index_param)
            if status.code != Status.SUCCESS:
                raise RuntimeError(
                    f'Index creation on Milvus server failed: {status}')

    def _create_document_field_map(self) -> Dict:
        return {
            self.index: self.embedding_field,
        }

    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None,
                        batch_size: int = 10_000):
        """
        Add new documents to the DocumentStore.

        :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
                                  them right away in Milvus. If not, you can later call update_embeddings() to create & index them.
        :param index: (SQL) index name for storing the docs and metadata
        :param batch_size: When working with large number of documents, batching can help reduce memory footprint.
        :return:
        """
        index = index or self.index
        self._create_collection_and_index_if_not_exist(index)
        field_map = self._create_document_field_map()

        if len(documents) == 0:
            logger.warning(
                "Calling DocumentStore.write_documents() with empty list")
            return

        document_objects = [
            Document.from_dict(d, field_map=field_map)
            if isinstance(d, dict) else d for d in documents
        ]

        add_vectors = False if document_objects[0].embedding is None else True

        batched_documents = get_batches_from_generator(document_objects,
                                                       batch_size)
        with tqdm(total=len(document_objects)) as progress_bar:
            for document_batch in batched_documents:
                vector_ids = []
                if add_vectors:
                    doc_ids = []
                    embeddings = []
                    for doc in document_batch:
                        doc_ids.append(doc.id)
                        if isinstance(doc.embedding, np.ndarray):
                            embeddings.append(doc.embedding.tolist())
                        elif isinstance(doc.embedding, list):
                            embeddings.append(doc.embedding)
                        else:
                            raise AttributeError(
                                f'Format of supplied document embedding {type(doc.embedding)} is not '
                                f'supported. Please use list or numpy.ndarray')

                    if self.update_existing_documents:
                        existing_docs = super().get_documents_by_id(
                            ids=doc_ids, index=index)
                        self._delete_vector_ids_from_milvus(
                            documents=existing_docs, index=index)

                    status, vector_ids = self.milvus_server.insert(
                        collection_name=index, records=embeddings)
                    if status.code != Status.SUCCESS:
                        raise RuntimeError(
                            f'Vector embedding insertion failed: {status}')

                docs_to_write_in_sql = []
                for idx, doc in enumerate(document_batch):
                    meta = doc.meta
                    if add_vectors:
                        meta["vector_id"] = vector_ids[idx]
                    docs_to_write_in_sql.append(doc)

                super().write_documents(docs_to_write_in_sql, index=index)
                progress_bar.update(batch_size)
        progress_bar.close()

        self.milvus_server.flush([index])
        if self.update_existing_documents:
            self.milvus_server.compact(collection_name=index)

    def update_embeddings(self,
                          retriever: BaseRetriever,
                          index: Optional[str] = None,
                          batch_size: int = 10_000):
        """
        Updates the embeddings in the document store using the encoding model specified in the retriever.
        This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).

        :param retriever: Retriever to use to get embeddings for text
        :param index: (SQL) index name for storing the docs and metadata
        :param batch_size: When working with a large number of documents, batching can help reduce memory footprint.
        :return: None
        """
        index = index or self.index
        self._create_collection_and_index_if_not_exist(index)

        document_count = self.get_document_count(index=index)
        if document_count == 0:
            logger.warning(
                "Calling DocumentStore.update_embeddings() on an empty index")
            return

        logger.info(f"Updating embeddings for {document_count} docs...")

        result = self.get_all_documents_generator(index=index,
                                                  batch_size=batch_size,
                                                  return_embedding=False)
        batched_documents = get_batches_from_generator(result, batch_size)
        with tqdm(total=document_count) as progress_bar:
            for document_batch in batched_documents:
                self._delete_vector_ids_from_milvus(documents=document_batch,
                                                    index=index)

                embeddings = retriever.embed_passages(
                    document_batch)  # type: ignore
                embeddings_list = [
                    embedding.tolist() for embedding in embeddings
                ]
                assert len(document_batch) == len(embeddings_list)

                status, vector_ids = self.milvus_server.insert(
                    collection_name=index, records=embeddings_list)
                if status.code != Status.SUCCESS:
                    raise RuntimeError(
                        f'Vector embedding insertion failed: {status}')

                vector_id_map = {}
                for vector_id, doc in zip(vector_ids, document_batch):
                    vector_id_map[doc.id] = vector_id

                self.update_vector_ids(vector_id_map, index=index)
                progress_bar.update(len(document_batch))
        progress_bar.close()

        self.milvus_server.flush([index])
        self.milvus_server.compact(collection_name=index)
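
    # Sketch (assumptions noted): after writing documents without vectors, embeddings can be
    # generated and indexed with any retriever that exposes embed_passages(), as used above.
    # `DensePassageRetriever` and its setup are placeholders, not prescribed by this class.
    #
    #     retriever = DensePassageRetriever(document_store=store)
    #     store.update_embeddings(retriever, batch_size=10_000)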

    def query_by_embedding(
            self,
            query_emb: np.ndarray,
            filters: Optional[dict] = None,
            top_k: int = 10,
            index: Optional[str] = None,
            return_embedding: Optional[bool] = None) -> List[Document]:
        """
        Find the documents that are most similar to the provided `query_emb` by using a vector similarity metric.

        :param query_emb: Embedding of the query (e.g. gathered from DPR)
        :param filters: Optional filters to narrow down the search space.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param top_k: How many documents to return
        :param index: (SQL) index name for storing the docs and metadata
        :param return_embedding: Whether to return the document embeddings along with the documents.
        :return: List of Documents that are most similar to `query_emb`
        """
        if filters:
            raise Exception(
                "Query filters are not implemented for the MilvusDocumentStore."
            )

        index = index or self.index
        status, ok = self.milvus_server.has_collection(collection_name=index)
        if status.code != Status.SUCCESS:
            raise RuntimeError(f'Milvus has collection check failed: {status}')
        if not ok:
            raise Exception(
                "No index exists. Use 'update_embeddings()` to create an index."
            )

        if return_embedding is None:
            return_embedding = self.return_embedding

        query_emb = query_emb.reshape(1, -1).astype(np.float32)
        status, search_result = self.milvus_server.search(
            collection_name=index,
            query_records=query_emb,
            top_k=top_k,
            params=self.search_param)
        if status.code != Status.SUCCESS:
            raise RuntimeError(f'Vector embedding search failed: {status}')

        vector_ids_for_query = []
        scores_for_vector_ids: Dict[str, float] = {}
        for vector_id_list, distance_list in zip(search_result.id_array,
                                                 search_result.distance_array):
            for vector_id, distance in zip(vector_id_list, distance_list):
                vector_ids_for_query.append(str(vector_id))
                scores_for_vector_ids[str(vector_id)] = distance

        documents = self.get_documents_by_vector_ids(vector_ids_for_query,
                                                     index=index)

        if return_embedding:
            self._populate_embeddings_to_docs(index=index, docs=documents)

        for doc in documents:
            doc.score = scores_for_vector_ids[doc.meta["vector_id"]]
            doc.probability = float(expit(np.asarray(doc.score / 100)))

        return documents
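
    # Sketch of a similarity query (illustrative only): the random vector stands in for a
    # real query embedding produced by the retriever's query encoder, and the dimension (768)
    # is an assumption that must match the collection.
    #
    #     query_emb = np.random.rand(768).astype(np.float32)
    #     for doc in store.query_by_embedding(query_emb, top_k=10):
    #         print(doc.score, doc.probability, doc.text)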

    def delete_all_documents(self,
                             index: Optional[str] = None,
                             filters: Optional[Dict[str, List[str]]] = None):
        """
        Delete all documents (from SQL AND Milvus).
        :param index: (SQL) index name for storing the docs and metadata
        :param filters: Optional filters to narrow down the search space.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :return: None
        """
        index = index or self.index
        super().delete_all_documents(index=index, filters=filters)
        status, ok = self.milvus_server.has_collection(collection_name=index)
        if status.code != Status.SUCCESS:
            raise RuntimeError(f'Milvus has collection check failed: {status}')
        if ok:
            status = self.milvus_server.drop_collection(collection_name=index)
            if status.code != Status.SUCCESS:
                raise RuntimeError(f'Milvus drop collection failed: {status}')

            self.milvus_server.flush([index])
            self.milvus_server.compact(collection_name=index)
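
    # Sketch: clearing the store removes the rows from SQL and drops the Milvus collection,
    # so a later write_documents() call recreates the collection from scratch.
    #
    #     store.delete_all_documents()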

    def get_all_documents_generator(
        self,
        index: Optional[str] = None,
        filters: Optional[Dict[str, List[str]]] = None,
        return_embedding: Optional[bool] = None,
        batch_size: int = 10_000,
    ) -> Generator[Document, None, None]:
        """
        Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
        document store and yielded as individual documents. This method can be used to iteratively process
        a large number of documents without having to load all documents in memory.

        :param index: Name of the index to get the documents from. If None, the
                      DocumentStore's default index (self.index) will be used.
        :param filters: Optional filters to narrow down the documents to return.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param return_embedding: Whether to return the document embeddings.
        :param batch_size: When working with a large number of documents, batching can help reduce memory footprint.
        """
        index = index or self.index
        documents = super().get_all_documents_generator(index=index,
                                                        filters=filters,
                                                        batch_size=batch_size)
        if return_embedding is None:
            return_embedding = self.return_embedding

        for doc in documents:
            if return_embedding:
                self._populate_embeddings_to_docs(index=index, docs=[doc])
            yield doc
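
    # Sketch: streaming documents batch-by-batch keeps memory bounded; embeddings are only
    # fetched from Milvus when return_embedding=True. `handle()` is a placeholder for user code.
    #
    #     for doc in store.get_all_documents_generator(return_embedding=True, batch_size=10_000):
    #         handle(doc)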