def test_faiss_index_save_and_load(tmp_path):
    """Round-trip a FAISS index through save()/load() and check nothing is lost."""
    # NOTE(review): four slashes plus an absolute tmp_path yields five slashes in
    # the URL; SQLAlchemy appears to tolerate this — confirm before changing.
    db_url = f"sqlite:////{tmp_path/'haystack_test.db'}"
    store = FAISSDocumentStore(sql_url=db_url, index="haystack_test")
    store.write_documents(DOCUMENTS)

    # Persist the vector index to disk, then wipe the in-memory copy so the
    # reload below is the only possible source of the vectors.
    index_path = tmp_path / "haystack_test_faiss"
    store.save(index_path)
    store.faiss_indexes[store.index].reset()
    assert store.faiss_indexes[store.index].ntotal == 0

    # Reload and verify both the FAISS vectors and the SQL-backed documents.
    restored = FAISSDocumentStore.load(
        sql_url=db_url,
        faiss_file_path=index_path,
        index=store.index,
    )
    assert restored.faiss_indexes[store.index].ntotal == len(DOCUMENTS)
    assert len(restored.get_all_documents()) == len(DOCUMENTS)
# Retriever options kept from the tutorial for reference:
#   max_seq_len_passage=256, batch_size=16, use_gpu=True,
#   embed_title=True, use_fast_tokenizers=True

# Load the FAQ sheet: columns "question", "answer", plus custom metadata.
df = pd.read_csv("faq.csv")

# Minimal cleaning — blank out NaNs and trim whitespace around the questions.
df.fillna(value="", inplace=True)
df["question"] = df["question"].apply(lambda q: q.strip())
print(df.head())

# Optional pre-computation of question embeddings (disabled):
#   questions = list(df["question"].values)
#   df["question_emb"] = retriever2.embed_queries(texts=questions)

# The document store embeds whatever sits in the "text" field, so rename.
df = df.rename(columns={"question": "text"})

# Re-index: drop any existing documents, then write the fresh FAQ rows.
docs_to_index = df.to_dict(orient="records")
document_store.delete_all_documents()
document_store.write_documents(docs_to_index)

# Build the retriever from the locally saved model and compute embeddings
# for every stored document, then persist the FAISS index to disk.
retriever2 = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence_bert-saved",
    use_gpu=False,
)
document_store.update_embeddings(retriever2)
document_store.save('faiss2')