def setup(self):
    print("SETTING UP PIPELINE")
    self.document_store = ElasticsearchDocumentStore(
        similarity="dot_product", host="elasticsearch", username="",
        password="", index="document")
    self.document_store_faiss = FAISSDocumentStore(
        index="document",
        faiss_index_factory_str="Flat",
        return_embedding=True,
        sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
    )
    processor, converter = self.write_as4_docs()
    table_data = self.write_table_docs(converter, processor)
    es_retriever = ElasticsearchRetriever(document_store=self.document_store)
    print("SETTING UP DPR")
    dpr_retriever = DPRTrainingManager.get_current_retriever(
        self.document_store_faiss)
    print("SETTING UP EMBEDDINGS")
    embedding_retriever = EmbeddingRetriever(
        document_store=self.document_store_faiss,
        embedding_model="deepset/sentence_bert"
    )
    query_classifier = QueryClassifier()
    print("SETTING UP TABLE")
    table_retriever = TableRetriever(table_data)
    print("SETUP RETRIEVERS")
    self.question_generator = FurtherQuestionGenerator()
    print("UPDATING EMBEDDINGS")
    self.document_store_faiss.update_embeddings(dpr_retriever)
    print("UPDATED EMBEDDINGS")
    self.dpr_node = ContinualDPRNode(dpr_retriever, self.document_store_faiss)
    result = Result()
    self.trainer = DPRTrainingManager(self.document_store_faiss, self.dpr_node)
    print("SETUP COMPONENTS")
    pipeline = Pipeline()
    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=self.dpr_node, name="DPRRetriever", inputs=["Query"])
    pipeline.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
    pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults",
                      inputs=["DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
    pipeline.add_node(component=query_classifier, name="QueryClassifier", inputs=["JoinResults"])
    pipeline.add_node(component=self.question_generator, name="QnGenerator",
                      inputs=["QueryClassifier.output_1"])
    pipeline.add_node(component=table_retriever, name="TableRetriever",
                      inputs=["QueryClassifier.output_2"])
    pipeline.add_node(component=result, name="Result",
                      inputs=["QnGenerator", "TableRetriever"])
    self.pipeline = pipeline
    print("SETUP PIPELINE")
def faiss_document_store():
    if os.path.exists("haystack_test_faiss.db"):
        os.remove("haystack_test_faiss.db")
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db", return_embedding=True)
    yield document_store
    document_store.faiss_index.reset()
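# A minimal sketch of a test consuming the generator above (assumes it is
# registered with @pytest.fixture and that DOCUMENTS is the same sample list
# used by the other tests in this collection):
def test_faiss_write_documents(faiss_document_store):
    faiss_document_store.write_documents(DOCUMENTS)
    assert faiss_document_store.get_document_count() == len(DOCUMENTS)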
def load(self):
    if self.finder and self.finder2:
        return
    if not self.document_store2:
        # save before load in preprocess
        self.document_store2 = FAISSDocumentStore.load(
            sql_url=sqlUrlFAQ, faiss_file_path='faiss2')
        self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)
    # else:  # reset session
    #     self.document_store2.session.close()
    #     super(FAISSDocumentStore, self.document_store2).__init__(url=sqlUrlFAQ)
    if not self.retriever2:
        self.retriever2 = EmbeddingRetriever(
            document_store=self.document_store2,
            embedding_model="sentence_bert-saved", use_gpu=False)
    if not self.finder2:
        self.finder2 = Finder(reader=None, retriever=self.retriever2)
    if not self.document_store:
        self.document_store = SQLDocumentStore(url=sqlUrl)
        # FAISSDocumentStore.load(faiss_file_path='faiss1', sql_url=sqlUrl)
        self.initSql(url=sqlUrl, document_store=self.document_store)
    # else:  # reset session
    #     self.document_store.session.close()
    #     super(FAISSDocumentStore, self.document_store).__init__(url=sqlUrl)
    # self.retriever = EmbeddingRetriever(  # reduce load by sharing the same retriever and setting the store on the fly?
    #     document_store=self.document_store, embedding_model="sentence_bert-saved",
    #     use_gpu=False) if not self.retriever else self.retriever
    if not self.retriever:
        self.retriever = TfidfRetriever(document_store=self.document_store)
    self.reader = FARMReader(model_name_or_path=modelDir, use_gpu=False,
                             no_ans_boost=0) if not self.reader else self.reader
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
    #                             tokenizer="distilbert-base-uncased", use_gpu=-1)
    self.finder = Finder(self.reader, self.retriever) if not self.finder else self.finder
def get_document_store(document_store_type, embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://", index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(
            return_embedding=True, embedding_field=embedding_field, index="haystack_test"
        )
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test", return_embedding=True, embedding_field=embedding_field
        )
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
def get_document_store(document_store_type, similarity='dot_product'):
    """TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes a string for the type of DocStore."""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
        assert document_store.get_document_count() == 0
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="eval_document", similarity=similarity, timeout=3000)
    elif document_store_type in ("milvus_flat", "milvus_hnsw"):
        if document_store_type == "milvus_flat":
            index_type = IndexType.FLAT
            index_param = None
            search_param = None
        elif document_store_type == "milvus_hnsw":
            index_type = IndexType.HNSW
            index_param = {"M": 64, "efConstruction": 80}
            search_param = {"ef": 20}
        document_store = MilvusDocumentStore(similarity=similarity,
                                             index_type=index_type,
                                             index_param=index_param,
                                             search_param=search_param)
        assert document_store.get_document_count(index="eval_document") == 0
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run([
            'docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'
        ], shell=True)
        time.sleep(6)
        status = subprocess.run([
            'docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'
        ], shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            sql_url="postgresql://*****:*****@localhost:5432/haystack",
            faiss_index_factory_str=index_type,
            similarity=similarity)
        assert document_store.get_document_count() == 0
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
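# A minimal usage sketch for the helper above; the "faiss_hnsw" branch spins up
# a throwaway dockerised Postgres instance before constructing the store:
eval_store = get_document_store("faiss_hnsw", similarity="dot_product")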
def test_faiss_index_save_and_load(tmp_path):
    document_store = FAISSDocumentStore(
        sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
        index="haystack_test",
    )
    document_store.write_documents(DOCUMENTS)

    # test saving the index
    document_store.save(tmp_path / "haystack_test_faiss")

    # clear the existing faiss_index
    document_store.faiss_indexes[document_store.index].reset()

    # test that the faiss index is cleared
    assert document_store.faiss_indexes[document_store.index].ntotal == 0

    # test loading the index
    new_document_store = FAISSDocumentStore.load(
        sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
        faiss_file_path=tmp_path / "haystack_test_faiss",
        index=document_store.index)

    # check that the faiss index is restored
    assert new_document_store.faiss_indexes[document_store.index].ntotal == len(DOCUMENTS)
    # check that the documents are restored
    assert len(new_document_store.get_all_documents()) == len(DOCUMENTS)
def get_document_store(document_store_type, embedding_dim=768, embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://", index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(
            return_embedding=True, embedding_dim=embedding_dim,
            embedding_field=embedding_field, index="haystack_test")
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test", return_embedding=True,
            embedding_dim=embedding_dim, embedding_field=embedding_field)
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        _, collections = document_store.milvus_server.list_collections()
        for collection in collections:
            if collection.startswith("haystack_test"):
                document_store.milvus_server.drop_collection(collection)
        return document_store
    elif document_store_type == "weaviate":
        document_store = WeaviateDocumentStore(
            weaviate_url="http://localhost:8080", index="Haystacktest")
        document_store.weaviate_client.schema.delete_all()
        document_store._create_schema_and_index_if_not_exist()
        return document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
def get_document_store(document_store_type, es_similarity='cosine'):
    """TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes a string for the type of DocStore."""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="eval_document", similarity=es_similarity)
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"

        # TEMP FIX for issue with deleting docs
        # status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        # time.sleep(3)
        # try:
        #     document_store = FAISSDocumentStore(
        #         sql_url="postgresql://*****:*****@localhost:5432/haystack",
        #         faiss_index_factory_str=index_type)
        # except:
        #     # Launch a postgres instance & create an empty DB
        #     logger.info("Didn't find Postgres. Starting a new instance...")
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run([
            'docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'
        ], shell=True)
        time.sleep(3)
        status = subprocess.run([
            'docker exec -it haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'
        ], shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            sql_url="postgresql://*****:*****@localhost:5432/haystack",
            faiss_index_factory_str=index_type)
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    assert document_store.get_document_count() == 0
    return document_store
def test_faiss_retrieving(index_factory):
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db",
        faiss_index_factory_str=index_factory)
    document_store.delete_all_documents(index="document")
    if "ivf" in index_factory.lower():
        document_store.train_index(DOCUMENTS)
    document_store.write_documents(DOCUMENTS)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    result = retriever.retrieve(query="How to test this?")
    assert len(result) == len(DOCUMENTS)
    assert type(result[0]) == Document
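# The test above receives `index_factory` as an argument; a plausible pytest
# parametrization (the factory strings are illustrative FAISS index types,
# including an IVF variant to exercise the train_index branch):
#
#   @pytest.mark.parametrize("index_factory", ["Flat", "HNSW", "IVF1,Flat"])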
def test_faiss_passing_index_from_outside():
    d = 768
    nlist = 2
    quantizer = faiss.IndexFlatIP(d)
    faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
    faiss_index.nprobe = 2
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db", faiss_index=faiss_index)
    document_store.delete_all_documents(index="document")

    # as it is an IVF index we need to train it before adding docs
    document_store.train_index(DOCUMENTS)
    document_store.write_documents(documents=DOCUMENTS, index="document")
    documents_indexed = document_store.get_all_documents(index="document")

    # test document correctness
    check_data_correctness(documents_indexed, DOCUMENTS)
def test_faiss_passing_index_from_outside():
    d = 768
    nlist = 2
    quantizer = faiss.IndexFlatIP(d)
    faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
    faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable)
    faiss_index.nprobe = 2
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db", faiss_index=faiss_index)
    document_store.delete_all_documents(index="document")

    # as it is an IVF index we need to train it before adding docs
    document_store.train_index(DOCUMENTS)
    document_store.write_documents(documents=DOCUMENTS, index="document")
    documents_indexed = document_store.get_all_documents(index="document")

    # test that vector ids are associated with docs
    for doc in documents_indexed:
        assert 0 <= int(doc.meta["vector_id"]) <= 7
def test_faiss_index_save_and_load(document_store):
    document_store.write_documents(DOCUMENTS)

    # test saving the index
    document_store.save("haystack_test_faiss")

    # clear the existing faiss_index
    document_store.faiss_index.reset()

    # test that the faiss index is cleared
    assert document_store.faiss_index.ntotal == 0

    # test loading the index
    new_document_store = FAISSDocumentStore.load(
        sql_url="sqlite://", faiss_file_path="haystack_test_faiss")

    # check that the faiss index is restored
    assert new_document_store.faiss_index.ntotal == len(DOCUMENTS)
def get_document_store(document_store_type):
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")
    elif document_store_type == "faiss":
        if os.path.exists("haystack_test_faiss.db"):
            os.remove("haystack_test_faiss.db")
        document_store = FAISSDocumentStore(
            sql_url="sqlite:///haystack_test_faiss.db")
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
def Find_answer(text_file_path, data_folder_path, symbol, question):
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
    with open(text_file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    # split the source file on the given symbol and write one file per chunk
    for i, line in enumerate(data.split(symbol)):
        with open(f'{data_folder_path}/data{i+1}.txt', 'w') as f:
            print(f'writing file no.{i+1}')
            f.write(line)
    test_dicts = convert_files_to_dicts(dir_path=data_folder_path,
                                        clean_func=clean_wiki_text,
                                        split_paragraphs=True)
    document_store.write_documents(test_dicts)
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)
    document_store.update_embeddings(retriever)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True, context_window_size=300)
    pipe = ExtractiveQAPipeline(reader, retriever)
    prediction = pipe.run(query=question, top_k_retriever=10, top_k_reader=3)
    # collect the contexts of the predicted answers, de-duplicated
    doc_with_ans = []
    for i in range(len(prediction['answers'])):
        if prediction['answers'][i]['context'] not in doc_with_ans:
            doc_with_ans.append(prediction['answers'][i]['context'])
    answer = ' '.join(doc_with_ans)
    return answer
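# Hypothetical invocation of Find_answer (the paths, delimiter and question
# are placeholders, not taken from the original source):
answer = Find_answer("corpus.txt", "data", "\n\n", "Who is the father of Arya Stark?")
print(answer)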
                 stdout=PIPE, stderr=STDOUT,
                 preexec_fn=lambda: os.setuid(1)
                 )

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

document_store_ES = ElasticsearchDocumentStore(
    similarity="dot_product", host="localhost", username="", password="")

from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_store.faiss import FAISSDocumentStore

# FAISS DocumentStore used for DPR and embedding retriever
document_store_FAISS = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True
)

import pprint
pp = pprint.PrettyPrinter(indent=2)

"""## Document Preprocessing"""

from haystack.reader.farm import FARMReader
import haystack

converter = haystack.file_converter.txt.TextConverter(
    remove_numeric_tables=False, valid_languages=["en"])
as4 = converter.convert(file_path="/content/as4-winterBarley.txt")
import time
from elasticsearch import Elasticsearch

es_server = Popen(['/home/dr_lunars/elasticsearch-7.0.0/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT)
time.sleep(30)

es = Elasticsearch("http://localhost:9200", timeout=300, max_retries=10,
                   retry_on_timeout=True)

daily_score = 0

# %%
# DPR
from haystack.document_store.faiss import FAISSDocumentStore

document_store = FAISSDocumentStore.load(faiss_file_path="my_faiss",
                                         sql_url="sqlite:///my_doc_store.db",
                                         index="document")

from dpr_inference import DPR

model_path = '/home/dr_lunars/models/question_encoder-optimized-quantized.onnx'
tokenizer_path = "kykim/bert-kor-base"

dpr = DPR(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    document_store=document_store
)

# %%
# Reader
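# The "my_faiss" index file loaded above would have been produced earlier by
# the matching save step; a sketch of that counterpart (same paths assumed):
#
#   document_store.save("my_faiss")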
# Minimal cleaning
df.fillna(value="", inplace=True)
print(df.head())

titles = list(df["title"].values)
texts = list(df["text"].values)

# Convert to haystack document format
documents: List[Document] = []
for title, text in zip(titles, texts):
    documents.append(Document(text=text, meta={"name": title or ""}))

# Initialize FAISS document store for the documents and the corresponding embedding index.
# Set `return_embedding` to `True`, so the generator doesn't have to perform re-embedding
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat",
                                    return_embedding=True)

# Initialize DPR Retriever to encode documents, encode question and query documents
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
    embed_title=True,
)

# Initialize RAG Generator
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=False,
    top_k_answers=1,
def get_faiss_document_store():
    return FAISSDocumentStore(
        faiss_index_factory_str=hyperparams.faiss_index_factory_str,
        sql_url=sql_url)
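# Hypothetical wiring of the factory above, mirroring the retriever setup used
# elsewhere in this collection (`hyperparams` and `sql_url` are assumed to be
# module-level settings):
store = get_faiss_document_store()
retriever = EmbeddingRetriever(document_store=store,
                               embedding_model="deepset/sentence_bert",
                               use_gpu=False)
store.update_embeddings(retriever)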
def tutorial7_rag_generator():
    # Add documents from which you want to generate answers
    # Download a csv containing some sample document data
    temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
    open('small_generator_dataset.csv', 'wb').write(temp.content)

    # Get dataframe with columns "title" and "text"
    df = pd.read_csv("small_generator_dataset.csv", sep=',')
    # Minimal cleaning
    df.fillna(value="", inplace=True)
    print(df.head())

    titles = list(df["title"].values)
    texts = list(df["text"].values)

    # Convert to haystack document format
    documents: List[Document] = []
    for title, text in zip(titles, texts):
        documents.append(Document(text=text, meta={"name": title or ""}))

    # Initialize FAISS document store for the documents and the corresponding embedding index.
    # Set `return_embedding` to `True`, so the generator doesn't have to perform re-embedding
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat",
        return_embedding=True
    )

    # Initialize DPR Retriever to encode documents, encode question and query documents
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        embed_title=True,
    )

    # Initialize RAG Generator
    generator = RAGenerator(
        model_name_or_path="facebook/rag-token-nq",
        use_gpu=True,
        top_k_answers=1,
        max_length=200,
        min_length=2,
        embed_title=True,
        num_beams=2,
    )

    # Delete existing documents in the document store
    document_store.delete_all_documents()
    # Write documents to the document store
    document_store.write_documents(documents)
    # Add document embeddings to the index
    document_store.update_embeddings(retriever=retriever)

    # Now ask your questions
    # We have some sample questions
    QUESTIONS = [
        "who got the first nobel prize in physics",
        "when is the next deadpool movie being released",
        "which mode is used for short wave broadcast service",
        "who is the owner of reading football club",
        "when is the next scandal episode coming out",
        "when is the last time the philadelphia won the superbowl",
        "what is the most current adobe flash player version",
        "how many episodes are there in dragon ball z",
        "what is the first step in the evolution of the eye",
        "where is gall bladder situated in human body",
        "what is the main mineral in lithium batteries",
        "who is the president of usa right now",
        "where do the greasers live in the outsiders",
        "panda is a national animal of which country",
        "what is the name of manchester united stadium",
    ]

    # Now generate an answer for each question
    for question in QUESTIONS:
        # Retrieve related documents from the retriever
        retriever_results = retriever.retrieve(query=question)

        # Now generate an answer from the question and the retrieved documents
        predicted_result = generator.predict(
            query=question,
            documents=retriever_results,
            top_k=1
        )

        # Print your answer
        answers = predicted_result["answers"]
        print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
}).to_dict(orient='records')

# %%
# clean data
# preprocessing from haystack
preprocessor = PreProcessor(clean_empty_lines=True,
                            clean_whitespace=True,
                            clean_header_footer=False,
                            split_by="word",
                            split_length=100,
                            split_respect_sentence_boundary=True,
                            split_overlap=10)

nested_docs = [preprocessor.process(d) for d in all_dicts]
docs = [d for x in nested_docs for d in x]

# %%
# start FAISS document store and store docs
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
document_store.write_documents(docs)

# %%
# initialise retriever
from haystack.retriever.dense import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=False,
    embed_title=True,
def tutorial12_lfqa():
    """
    Document Store:
    FAISS is a library for efficient similarity search on a cluster of dense vectors.
    The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under the hood
    to store the document text and other meta data. The vector embeddings of the text are
    indexed on a FAISS Index that is later queried for searching answers.
    The default flavour of FAISSDocumentStore is "Flat" but it can also be set to "HNSW" for
    faster search at the expense of some accuracy. Just set the faiss_index_factory_str
    argument in the constructor. For more info on which suits your use case:
    https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    """
    from haystack.document_store.faiss import FAISSDocumentStore
    document_store = FAISSDocumentStore(vector_dim=128, faiss_index_factory_str="Flat")

    """
    Cleaning & indexing documents:
    Similarly to the previous tutorials, we download, convert and index some Game of Thrones
    articles to our DocumentStore.
    """
    # Let's first get some files that we want to use
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)

    """
    Initialize Retriever and Reader/Generator:
    We use a Retribert-based `EmbeddingRetriever` and invoke `update_embeddings` to index
    the embeddings of documents in the `FAISSDocumentStore`.
    """
    from haystack.retriever.dense import EmbeddingRetriever

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert")

    document_store.update_embeddings(retriever)

    """Before we blindly use the retriever, let's empirically test it to make sure a simple
    search indeed finds the relevant documents."""
    from haystack.utils import print_answers, print_documents
    from haystack.pipeline import DocumentSearchPipeline

    p_retrieval = DocumentSearchPipeline(retriever)
    res = p_retrieval.run(query="Tell me something about Arya Stark?", top_k_retriever=5)
    print_documents(res, max_text_len=512)

    """
    Similar to previous tutorials, we now initialize our reader/generator.
    Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model
    (see: https://huggingface.co/yjernite/bart_eli5)
    """
    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")

    """
    Pipeline:
    With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize
    for your own use cases. To speed things up, Haystack also comes with a few predefined
    Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a
    reader/generator to answer our questions.
    You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    """
    from haystack.pipeline import GenerativeQAPipeline
    pipe = GenerativeQAPipeline(generator, retriever)

    """Voilà! Ask a question!"""
    query_1 = "Why did Arya Stark's character get portrayed in a television adaptation?"
    result_1 = pipe.run(query=query_1, top_k_retriever=1)
    print(f"Query: {query_1}")
    print(f"Answer: {result_1['answers'][0]}")
    print()

    query_2 = "What kind of character does Arya Stark play?"
    result_2 = pipe.run(query=query_2, top_k_retriever=1)
    print(f"Query: {query_2}")
    print(f"Answer: {result_2['answers'][0]}")
    print()
    pipe.run(query=query_2, top_k_retriever=1)
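# The docstring above notes that "Flat" can be swapped for "HNSW" via the
# faiss_index_factory_str argument for faster search at some cost in accuracy;
# a minimal sketch of that variant (vector_dim kept at 128 to match the
# Retribert embeddings used in this tutorial):
document_store_hnsw = FAISSDocumentStore(vector_dim=128,
                                         faiss_index_factory_str="HNSW")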
class DPRTrainingTester:
    """To run with an in-memory sqlite database"""
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat"
    )
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        # use_gpu=True,
        use_gpu=False,
        embed_title=True,
        use_fast_tokenizers=True
    )

    # Loads the test document into the document store
    def loadDocumentsFromFile(self, knowledgeFilePath):
        converter = TextConverter(
            remove_numeric_tables=False, valid_languages=["en"])
        processor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=False,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        self.trainingFile = knowledgeFilePath
        loadedFile = converter.convert(knowledgeFilePath)
        documents = processor.process(loadedFile)
        for i in range(0, len(documents)):
            docMetadata = documents[i]['meta']
            docMetadata['name'] = knowledgeFilePath
            docMetadata['documentID'] = knowledgeFilePath \
                + str(docMetadata['_split_id'])
        self.document_store.write_documents(documents)
        backagain = self.document_store.get_all_documents()
        print("Number of documents loaded", end=": ")
        print(self.document_store.get_document_count())

    def __init__(self, knowledgeFilePath):
        print("Started DPR Training tester!")
        # load the test document into the database
        print("Loading documents from " + knowledgeFilePath)
        self.document_store.delete_all_documents()
        self.loadDocumentsFromFile(knowledgeFilePath)
        # update dpr embeddings based on the initial retriever
        print("Performing initial embeddings update")
        self.document_store.update_embeddings(self.retriever)
        # generate a new dprTrainingSet to populate
        self.trainingSet = DPRTrainingSet(self.document_store, 0)

    # return the document store's id for the response marked correct
    def get_correct_id(self, responses, correctNum):
        # correctResponse = responses[correctNum].to_dict()
        return responses[correctNum].id

    # return a list of document store ids for the alternative responses
    def get_incorrect_ids(self, responses, correctNum):
        ids = []
        for i in range(0, len(responses)):
            if i == correctNum:
                continue
            ids.append(responses[i].id)
        return ids

    def askQuestion(self):
        print("------------------------------")
        question = input("Enter new question (DONE to finish): ")
        if question == "":
            return
        if question == 'DONE':
            self.generateTraining()
            return
        k = 10
        responses = self.retriever.retrieve(question, top_k=k)
        print()
        for i in range(0, k):
            print(i, end=": ")
            print(responses[i].text)
            # print(responses[i])
            print()
        print()
        correctNum = input("Select correct response (X if none correct): ")
        if correctNum == "":
            return
        if correctNum == 'X':
            return
        print()
        print("------------------------------")
        self.trainingSet.addItem(
            question=question,
            posID=self.get_correct_id(responses, int(correctNum)),
            negIDs=self.get_incorrect_ids(responses, int(correctNum))
        )

    # file where all the training stuff is
    doc_dir = "data/"

    def generateTraining(self):
        self.trainingSet.addInBatchNegatives()
        self.trainingSet.generateJSON(self.trainingFile + "SET.json")
        print("New training set saved to: " + self.trainingFile + "SET.json")
        exit(0)

    def loop(self):
        self.askQuestion()
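# Hypothetical driver for the tester above (the knowledge-base path is a
# placeholder, not from the original source); askQuestion() exits the process
# via generateTraining() once "DONE" is entered:
if __name__ == "__main__":
    tester = DPRTrainingTester("knowledgeBase/text/as4.txt")
    while True:
        tester.loop()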
class MLPipeline:

    def __init__(self):
        self.pipeline = None
        self.document_store = None
        self.document_store_faiss = None
        self.question_generator = None
        self.doc_lock = RLock()

    def write_as4_docs(self):
        converter = file_converter.txt.TextConverter(
            remove_numeric_tables=False, valid_languages=["en"])
        processor = preprocessor.preprocessor.PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=False,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        self.document_store.delete_all_documents(index="document")
        self.document_store_faiss.delete_all_documents(index="document")
        for file in [file for file in os.listdir("knowledgeBase/text") if ".txt" in file]:
            doc = converter.convert(file_path="knowledgeBase/text/" + file)
            doc_processed = processor.process(doc)
            for i in range(len(doc_processed)):
                doc_processed[i]["meta"]["index"] = -1
                doc_processed[i]["meta"]["table"] = False
                doc_processed[i]["meta"]["name"] = file[:-4]
            self.document_store.write_documents(doc_processed, index="document")
            self.document_store_faiss.write_documents(doc_processed, index="document")
        backagain = self.document_store_faiss.get_all_documents()
        for i in range(0, len(backagain)):
            print(i)
            print(":\n")
            print(backagain[i])
            print("---------------")
        return (processor, converter)

    def write_table_docs(self, converter, processor):
        data = []
        docs = []
        for file in [file for file in os.listdir("knowledgeBase/tables") if ".csv" in file]:
            with open("knowledgeBase/tables/" + file, mode='r') as infile:
                reader = csv.reader(infile)
                new_dict = {row[0]: row[1:] for row in reader}
                data.append(new_dict)
            with open("knowledgeBase/tables/" + file[:-4] + ".txt", mode='r') as infile:
                docs.append(infile.read())
        with open('knowledgeBase/table_text.txt', 'w') as outfile:
            for item in docs:
                outfile.write("%s\n\n" % item)
        # Construct FAISS DocumentStore for table content
        tables = converter.convert(file_path="knowledgeBase/table_text.txt")
        tableDocs = processor.process(tables)
        for i in range(len(tableDocs)):
            tableDocs[i]["meta"]["index"] = i
            tableDocs[i]["meta"]["table"] = True
            tableDocs[i]["meta"]["name"] = " "
        self.document_store.write_documents(tableDocs, index="document")
        self.document_store_faiss.write_documents(tableDocs, index="document")
        return data

    def re_process_documents(self):
        with self.doc_lock:
            self.setup()

    def setup(self):
        print("SETTING UP PIPELINE")
        self.document_store = ElasticsearchDocumentStore(
            similarity="dot_product", host="elasticsearch", username="",
            password="", index="document")
        self.document_store_faiss = FAISSDocumentStore(
            index="document",
            faiss_index_factory_str="Flat",
            return_embedding=True,
            sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
        )
        processor, converter = self.write_as4_docs()
        table_data = self.write_table_docs(converter, processor)
        es_retriever = ElasticsearchRetriever(document_store=self.document_store)
        print("SETTING UP DPR")
        dpr_retriever = DPRTrainingManager.get_current_retriever(
            self.document_store_faiss)
        print("SETTING UP EMBEDDINGS")
        embedding_retriever = EmbeddingRetriever(
            document_store=self.document_store_faiss,
            embedding_model="deepset/sentence_bert"
        )
        query_classifier = QueryClassifier()
        print("SETTING UP TABLE")
        table_retriever = TableRetriever(table_data)
        print("SETUP RETRIEVERS")
        self.question_generator = FurtherQuestionGenerator()
        print("UPDATING EMBEDDINGS")
        self.document_store_faiss.update_embeddings(dpr_retriever)
        print("UPDATED EMBEDDINGS")
        self.dpr_node = ContinualDPRNode(dpr_retriever, self.document_store_faiss)
        result = Result()
        self.trainer = DPRTrainingManager(self.document_store_faiss, self.dpr_node)
        print("SETUP COMPONENTS")
        pipeline = Pipeline()
        pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
        pipeline.add_node(component=self.dpr_node, name="DPRRetriever", inputs=["Query"])
        pipeline.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
        pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults",
                          inputs=["DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
        pipeline.add_node(component=query_classifier, name="QueryClassifier", inputs=["JoinResults"])
        pipeline.add_node(component=self.question_generator, name="QnGenerator",
                          inputs=["QueryClassifier.output_1"])
        pipeline.add_node(component=table_retriever, name="TableRetriever",
                          inputs=["QueryClassifier.output_2"])
        pipeline.add_node(component=result, name="Result",
                          inputs=["QnGenerator", "TableRetriever"])
        self.pipeline = pipeline
        print("SETUP PIPELINE")

    def answer(self, question, history={}):
        with self.doc_lock:
            if self.pipeline is None:
                return ""
            print(f"USING HISTORY: {history}")
            self.question_generator.history = history
            responses = self.pipeline.run(
                query=self.question_generator.question_parsing(question),
                top_k_retriever=20)
            if type(responses) is list:
                return responses[0]
            else:
                return responses

    def report(self, question):
        print("Question reported:")
        if self.trainer is None:
            print("Trainer is missing!")
            return []
        return self.trainer.processQuestion(question)

    def processTrainingAction(self, question, choices, correct_num):
        if self.trainer is None:
            print("Trainer is missing!")
            return 0
        return self.trainer.processTrainingAction(question, choices, correct_num)
def create_store():
    # FAISS is a library for efficient similarity search on a cluster of dense vectors.
    # The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under
    # the hood to store the document text and other meta data. The vector embeddings of
    # the text are indexed on a FAISS Index that is later queried for searching answers.
    document_store = FAISSDocumentStore()
    return document_store
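# A minimal sketch of making the store above persistent by pointing the
# underlying SQL database at a file instead of the default in-memory SQLite
# (the file name is illustrative):
persistent_store = FAISSDocumentStore(sql_url="sqlite:///my_faiss_store.db")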
def tutorial6_better_retrieval_via_dpr():
    # FAISS is a library for efficient similarity search on a cluster of dense vectors.
    # The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under
    # the hood to store the document text and other meta data. The vector embeddings of the
    # text are indexed on a FAISS Index that is later queried for searching answers.
    # The default flavour of FAISSDocumentStore is "Flat" but it can also be set to "HNSW" for
    # faster search at the expense of some accuracy. Just set the faiss_index_factory_str
    # argument in the constructor.
    # For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    # ## Preprocessing of documents
    # Let's first get some documents that we want to query
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    ### Retriever
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    # Important:
    # Now that we have the DPR retriever initialized, we need to call update_embeddings() to
    # iterate over all previously indexed documents and update their embedding representation.
    # While this can be a time-consuming operation (depending on corpus size), it only needs
    # to be done once. At query time, we only need to embed the query and compare it to the
    # existing doc embeddings, which is very fast.
    document_store.update_embeddings(retriever)

    ### Reader
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

    ### Pipeline
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
class ManagerTester:
    """To run with a separate postgres database"""
    # docker run -p 5432:5432 -e POSTGRES_PASSWORD=haystack -d postgres
    # document_store = FAISSDocumentStore(
    #     faiss_index_factory_str="Flat",
    #     sql_url="postgresql://*****:*****@localhost:5432"
    # )

    """To run with an in-memory sqlite database"""
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat"
    )
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        # use_gpu=True,
        use_gpu=False,
        embed_title=True,
        use_fast_tokenizers=True
    )

    # Loads the test document into the document store
    def loadDocumentsFromFile(self, knowledgeFilePath):
        converter = TextConverter(
            remove_numeric_tables=False, valid_languages=["en"])
        processor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        loadedFile = converter.convert(knowledgeFilePath)
        documents = processor.process(loadedFile)
        for i in range(0, len(documents)):
            docMetadata = documents[i]['meta']
            docMetadata['name'] = knowledgeFilePath
            docMetadata['documentID'] = knowledgeFilePath \
                + str(docMetadata['_split_id'])
        self.document_store.write_documents(documents)
        backagain = self.document_store.get_all_documents()
        # for i in range(0, len(backagain)):
        #     print(i)
        #     print(":\n")
        #     print(backagain[i])
        #     print("---------------")
        print("Number of documents loaded", end=": ")
        print(self.document_store.get_document_count())

    def __init__(self, knowledgeFilePath):
        print("Started DPR Training tester!")
        # load the test document into the database
        print("Loading documents from " + knowledgeFilePath)
        self.document_store.delete_all_documents()
        self.loadDocumentsFromFile(knowledgeFilePath)
        # update dpr embeddings based on the initial retriever
        print("Performing initial embeddings update")
        self.document_store.update_embeddings(self.retriever)
        self.trainingManager = DPRTrainingManager(self.document_store, 0)

    # return the document store's id for the response marked correct
    def get_correct_id(self, responses, correctNum):
        # correctResponse = responses[correctNum].to_dict()
        return responses[correctNum].id

    # return a list of document store ids for the alternative responses
    def get_incorrect_ids(self, responses, correctNum):
        ids = []
        for i in range(0, len(responses)):
            if i == correctNum:
                continue
            ids.append(responses[i].id)
        return ids

    def askQuestion(self):
        print("------------------------------")
        question = input("Enter new question (T to run training): ")
        if question == 'T':
            self.train()
            return
        k = 5
        responses = self.retriever.retrieve(question, top_k=k)
        print()
        for i in range(0, k):
            print(i, end=": ")
            print(responses[i].text)
            print()
        print()
        correctNum = input("Select correct response (X if none correct): ")
        if correctNum == 'X':
            return
        print()
        print("------------------------------")
        self.trainingManager.addItem(
            question=question,
            posID=self.get_correct_id(responses, int(correctNum)),
            negIDs=self.get_incorrect_ids(responses, int(correctNum))
        )

    def train(self):
        newModel = self.trainingManager.train()
        self.retriever = DensePassageRetriever.load(
            document_store=self.document_store,
            load_dir=newModel,
            max_seq_len_query=64,
            max_seq_len_passage=256,
            batch_size=16,
            # use_gpu=True,
            use_gpu=False,
            embed_title=True,
            use_fast_tokenizers=True
        )
        self.document_store.update_embeddings(self.retriever)

    def loop(self):
        self.askQuestion()
config = None
with open(configFile) as file:
    config = yaml.safe_load(file)
sqlUrlFAQ = config["sqlUrlFAQ"]

model = AdaptiveModel.convert_from_transformers(
    "deepset/sentence_bert", device="cpu", task_type="embeddings")
processor = Processor.convert_from_transformers(
    "deepset/sentence_bert", task_type="embeddings",
    max_seq_len=384, doc_stride=128)
model.save("sentence_bert-saved")
processor.save("sentence_bert-saved")

document_store = FAISSDocumentStore(sql_url=sqlUrlFAQ)

# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
#                                   max_seq_len_query=64,
#                                   max_seq_len_passage=256,
#                                   batch_size=16,
#                                   use_gpu=True,
#                                   embed_title=True,
#                                   use_fast_tokenizers=True)

# Get dataframe with columns "question", "answer" and some custom metadata
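# The "sentence_bert-saved" directory written above is what the load() snippet
# earlier in this collection points its EmbeddingRetriever at; a minimal sketch
# of wiring it up:
retriever = EmbeddingRetriever(document_store=document_store,
                               embedding_model="sentence_bert-saved",
                               use_gpu=False)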
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
from haystack.retriever.dense import DensePassageRetriever

# FAISS is a library for efficient similarity search on a cluster of dense vectors.
# The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under
# the hood to store the document text and other meta data. The vector embeddings of the
# text are indexed on a FAISS Index that is later queried for searching answers.
# The default flavour of FAISSDocumentStore is "Flat" but it can also be set to "HNSW" for
# faster search at the expense of some accuracy. Just set the faiss_index_factory_str
# argument in the constructor.
# For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

# ## Preprocessing of documents
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts)

### Retriever
retriever = DensePassageRetriever(document_store=document_store,
from haystack import Finder
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
from haystack.retriever.dense import DensePassageRetriever

# FAISS is a library for efficient similarity search on a cluster of dense vectors.
# The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under
# the hood to store the document text and other meta data. The vector embeddings of the
# text are indexed on a FAISS Index that is later queried for searching answers.
document_store = FAISSDocumentStore()

# ## Preprocessing of documents
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts)

### Retriever
retriever = DensePassageRetriever(
    document_store=document_store,