def test_get_document_count_only_documents_without_embedding_arg():
    """Verify ``get_document_count(only_documents_without_embedding=True)``.

    Seven documents are written to the ``haystack_test_count`` index after
    its existing documents have been deleted. Documents 4, 5 and 6 have no
    ``embedding`` key, so the embedding-less count is checked without a
    filter and with a filter on ``meta_field_for_count``.
    """
    index_name: str = "haystack_test_count"

    def random_vector(dtype):
        # 768 random values cast to the requested float precision.
        return np.random.rand(768).astype(dtype)

    docs = [
        {
            "text": "text1",
            "id": "1",
            "embedding": random_vector(np.float32),
            "meta_field_for_count": "a",
        },
        {
            "text": "text2",
            "id": "2",
            "embedding": random_vector(np.float64),
            "meta_field_for_count": "b",
        },
        # Embedding supplied as a plain list, and no meta field at all.
        {
            "text": "text3",
            "id": "3",
            "embedding": random_vector(np.float32).tolist(),
        },
        # Documents 4-6 deliberately carry no embedding.
        {"text": "text4", "id": "4", "meta_field_for_count": "b"},
        {"text": "text5", "id": "5", "meta_field_for_count": "b"},
        {"text": "text6", "id": "6", "meta_field_for_count": "c"},
        {
            "text": "text7",
            "id": "7",
            "embedding": random_vector(np.float64),
            "meta_field_for_count": "c",
        },
    ]

    store = ElasticsearchDocumentStore(index=index_name)
    store.delete_documents(index=index_name)
    store.write_documents(docs)

    assert store.get_document_count() == 7

    # (extra keyword arguments, expected number of embedding-less documents)
    cases = [
        ({}, 3),
        ({"filters": {"meta_field_for_count": ["c"]}}, 1),
        ({"filters": {"meta_field_for_count": ["b"]}}, 2),
    ]
    for extra_kwargs, expected in cases:
        count = store.get_document_count(
            only_documents_without_embedding=True, **extra_kwargs
        )
        assert count == expected
# Operational notes for the local Elasticsearch container used below.
#
# Start the existing container:
#   docker start fd2e31d49ed7f485d35f974594c404090269e20b9dc0ca9543d9c4a5bf626faf
# Turn off the disk-threshold allocation check and clear the
# read_only_allow_delete block on all indices:
#   curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_cluster/settings -d '{ "transient": { "cluster.routing.allocation.disk.threshold_enabled": false } }'
#   curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'
# saving:
#   docker cp elasticsearch:/usr/share/elasticsearch/data docker_es_save
# setup persistence:
#   sudo mkdir -p $PWD/elasticsearch/data
#   sudo chmod 777 -R $PWD/elasticsearch/data

elastic_ds = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
num_existing_docs = elastic_ds.get_document_count()
print('num docs', num_existing_docs)
if num_existing_docs == 0:
    # Stop immediately when the "document" index holds nothing.
    # RuntimeError is still caught by any ``except Exception`` handler.
    raise RuntimeError("no elastic saved data!!")


def write_dicts(elastic, dicts):
    """Write ``dicts`` to ``elastic`` in batches, announcing the count first.

    Args:
        elastic: Object exposing ``write_documents(dicts, batch_size=...)``;
            presumably an ``ElasticsearchDocumentStore`` -- confirm against
            callers.
        dicts: Sized collection of documents; ``len(dicts)`` is printed
            before the write starts.
    """
    print('starting to write', len(dicts))
    # NOTE(review): ``batch_size`` is not a parameter or local here; it is
    # looked up at call time and is expected to be a module-level name
    # defined elsewhere in this file -- confirm it exists before calling.
    elastic.write_documents(dicts, batch_size=batch_size)


import time


def get_dicts():
    i = 0