def test_get_document_count_only_documents_without_embedding_arg():
    """Check `only_documents_without_embedding`, alone and combined with filters.

    Seven documents are written: four carry an embedding, three do not.
    Of the three without an embedding, two have meta value "b" and one "c".
    """

    def _random_vector(dtype):
        return np.random.rand(768).astype(dtype)

    documents = [
        {"text": "text1", "id": "1", "embedding": _random_vector(np.float32), "meta_field_for_count": "a"},
        {"text": "text2", "id": "2", "embedding": _random_vector(np.float64), "meta_field_for_count": "b"},
        {"text": "text3", "id": "3", "embedding": _random_vector(np.float32).tolist()},
        {"text": "text4", "id": "4", "meta_field_for_count": "b"},
        {"text": "text5", "id": "5", "meta_field_for_count": "b"},
        {"text": "text6", "id": "6", "meta_field_for_count": "c"},
        {"text": "text7", "id": "7", "embedding": _random_vector(np.float64), "meta_field_for_count": "c"},
    ]

    _index: str = "haystack_test_count"
    document_store = ElasticsearchDocumentStore(index=_index)
    # Start from an empty index so the counts below are exact.
    document_store.delete_documents(index=_index)
    document_store.write_documents(documents)

    assert document_store.get_document_count() == 7

    # (extra keyword arguments, expected number of documents without embedding)
    cases = [
        ({}, 3),
        ({"filters": {"meta_field_for_count": ["c"]}}, 1),
        ({"filters": {"meta_field_for_count": ["b"]}}, 2),
    ]
    for extra_kwargs, expected in cases:
        assert document_store.get_document_count(
            only_documents_without_embedding=True, **extra_kwargs) == expected
def test_elasticsearch_custom_fields(elasticsearch_fixture):
    """A document written under custom text/embedding field names is read back unchanged."""
    index_name = "haystack_test_custom"

    # Drop any leftover index from an earlier run (a missing index is fine).
    Elasticsearch().indices.delete(index=index_name, ignore=[404])

    document_store = ElasticsearchDocumentStore(
        index=index_name,
        text_field="custom_text_field",
        embedding_field="custom_embedding_field",
    )

    doc_to_write = {
        "custom_text_field": "test",
        "custom_embedding_field": np.random.rand(768).astype(np.float32),
    }
    document_store.write_documents([doc_to_write])

    documents = document_store.get_all_documents(return_embedding=True)
    assert len(documents) == 1

    stored = documents[0]
    assert stored.text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], stored.embedding)
def update_elastic_embeddings(document_store: ElasticsearchDocumentStore,
                              retriever: BaseRetriever,
                              update_existing=False):
    """Embed the documents of ``document_store.index`` and bulk-write the vectors back.

    Documents are processed in batches of 10,000. Unless ``update_existing``
    is true, documents that already have an embedding are left alone.

    :param document_store: store whose ``index``, ``client``, ``embedding_field``
        and ``refresh_type`` are used for reading and for the bulk update.
    :param retriever: object whose ``embed_passages`` turns a batch of documents
        into one embedding per document.
    :param update_existing: when true, re-embed documents that already have one.
    """
    index = document_store.index
    all_documents = document_store.get_all_documents_generator(index)

    for batch in get_batches_from_generator(all_documents, 10_000):
        if len(batch) == 0:
            break

        if not update_existing:
            # Only documents that have no embedding yet.
            batch = [doc for doc in batch if doc.embedding is None]
        if len(batch) == 0:
            continue

        embeddings = retriever.embed_passages(batch)  # type: ignore
        assert len(batch) == len(embeddings)
        print('updating ', len(batch), ' embeddings')

        # One partial-document update action per (document, embedding) pair.
        actions = [
            {
                "_op_type": "update",
                "_index": index,
                "_id": doc.id,
                "doc": {document_store.embedding_field: vector.tolist()},
            }
            for doc, vector in zip(batch, embeddings)
        ]
        bulk(document_store.client, actions, request_timeout=300,
             refresh=document_store.refresh_type)
def __init__(self):
    """Wire a reader and an Elasticsearch-backed retriever into ``self.finder``."""
    # Components are built in the same order as before: reader first, then the store.
    reader = Reader(model_name_or_path=MODEL_PATH, tokenizer=MODEL_PATH, use_gpu=0)
    document_store = ElasticsearchDocumentStore(refresh_type='false')
    retriever = ElasticsearchRetriever(document_store=document_store)
    self.finder = Finder(reader=reader, retriever=retriever)
def get_elastic_document_store():
    """Bring up the local Elasticsearch Docker container and return a document store for it.

    Steps: optionally create the container and its data directory, start the
    container if ``docker ps`` does not list it, wait for Elasticsearch to
    boot, relax the disk-threshold / read-only settings via the REST API, and
    finally connect an ``ElasticsearchDocumentStore`` to ``localhost``.

    :return: an ``ElasticsearchDocumentStore`` on index ``"document"`` with
        ``return_embedding=True``.
    """

    def run(command):
        # Read the pipe to EOF and close it so the command has finished before
        # we continue. A bare os.popen(...) neither waits nor yields the output.
        with os.popen(command) as pipe:
            return pipe.read()

    def is_first_run():
        # NOTE(review): despite the name, this is True when `docker images`
        # already lists an elasticsearch 7.9.2 image - confirm intended polarity.
        existing_images = run('docker images')
        return 'elasticsearch' in existing_images and '7.9.2' in existing_images

    def create_image_and_volume():
        # The data directory must exist before the container mounts it.
        run('mkdir -m777 -p elasticsearch/data')
        run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" -v $PWD/elasticsearch/data:/usr/share/elasticsearch/data --name elasticsearch elasticsearch:7.9.2'
        )

    if is_first_run():
        create_image_and_volume()

    print('starting elastic docker')
    if 'elasticsearch' not in run('docker ps'):
        # `elastic_docker_id` is expected to be defined at module level.
        run("""docker start %s""" % elastic_docker_id)

    # Give Elasticsearch time to boot before talking to its REST API.
    time.sleep(25)

    # Print the actual responses (previously the pipe object's repr was printed).
    print(
        run(
            """curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_cluster/settings -d '{ "transient": { "cluster.routing.allocation.disk.threshold_enabled": false } }'"""
        ))
    print(
        run(
            """curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'"""
        ))
    time.sleep(5)

    elastic_ds = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document",
                                            return_embedding=True)
    return elastic_ds
def get_document_store(document_store_type, embedding_field="embedding"):
    """Create a fresh test document store of the requested backend.

    :param document_store_type: one of "sql", "memory", "elasticsearch",
        "faiss" or "milvus".
    :param embedding_field: name of the embedding field, forwarded to every
        backend except "sql".
    :raises Exception: if ``document_store_type`` is not one of the above.
    """
    if document_store_type == "sql":
        return SQLDocumentStore(url="sqlite://", index="haystack_test")

    if document_store_type == "memory":
        return InMemoryDocumentStore(
            return_embedding=True, embedding_field=embedding_field, index="haystack_test"
        )

    if document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        Elasticsearch().indices.delete(index='haystack_test*', ignore=[404])
        return ElasticsearchDocumentStore(
            index="haystack_test", return_embedding=True, embedding_field=embedding_field
        )

    if document_store_type == "faiss":
        return FAISSDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )

    if document_store_type == "milvus":
        return MilvusDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )

    raise Exception(f"No document store fixture for '{document_store_type}'")
def get_document_store(document_store_type, similarity='dot_product'):
    """Create an empty document store for benchmarking.

    TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore

    :param document_store_type: "sql", "memory", "elasticsearch",
        "milvus_flat", "milvus_hnsw", "faiss_flat" or "faiss_hnsw".
    :param similarity: similarity metric forwarded to the Elasticsearch,
        Milvus and FAISS stores.
    :raises Exception: for an unknown ``document_store_type``.
    """
    if document_store_type == "sql":
        db_file = "haystack_test.db"
        if os.path.exists(db_file):
            os.remove(db_file)
        store = SQLDocumentStore(url="sqlite:///haystack_test.db")
        assert store.get_document_count() == 0
        return store

    if document_store_type == "memory":
        return InMemoryDocumentStore()

    if document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        Elasticsearch().indices.delete(index='haystack_test*', ignore=[404])
        return ElasticsearchDocumentStore(index="eval_document", similarity=similarity, timeout=3000)

    if document_store_type in ("milvus_flat", "milvus_hnsw"):
        if document_store_type == "milvus_flat":
            index_type, index_param, search_param = IndexType.FLAT, None, None
        else:
            index_type = IndexType.HNSW
            index_param = {"M": 64, "efConstruction": 80}
            search_param = {"ef": 20}
        store = MilvusDocumentStore(similarity=similarity,
                                    index_type=index_type,
                                    index_param=index_param,
                                    search_param=search_param)
        assert store.get_document_count(index="eval_document") == 0
        return store

    if document_store_type in ("faiss_flat", "faiss_hnsw"):
        index_type = "Flat" if document_store_type == "faiss_flat" else "HNSW"

        # Recreate the Postgres container backing FAISS: (shell command, pause afterwards).
        postgres_steps = [
            ('docker rm -f haystack-postgres', 1),
            ('docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres', 6),
            ('docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"', 1),
        ]
        for command, pause in postgres_steps:
            subprocess.run([command], shell=True)
            time.sleep(pause)

        store = FAISSDocumentStore(
            sql_url="postgresql://*****:*****@localhost:5432/haystack",
            faiss_index_factory_str=index_type,
            similarity=similarity)
        assert store.get_document_count() == 0
        return store

    raise Exception(
        f"No document store fixture for '{document_store_type}'")
def setup(self):
    """Create the document stores, retrievers and the query pipeline.

    Populates ``self.document_store`` (Elasticsearch), ``self.document_store_faiss``,
    ``self.question_generator``, ``self.dpr_node``, ``self.trainer`` and
    ``self.pipeline``. Progress is reported with ``print`` at each stage.
    """
    print("SETTING UP PIPELINE")
    # Sparse (keyword) store.
    self.document_store = ElasticsearchDocumentStore(
        similarity="dot_product", host="elasticsearch", username="", password="", index="document")
    # Dense store; its SQL backend location is assembled from config values.
    self.document_store_faiss = FAISSDocumentStore(
        index="document",
        faiss_index_factory_str="Flat",
        return_embedding=True,
        sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
    )
    # Write the documents; the returned processor/converter are reused for the table docs.
    processor, converter = self.write_as4_docs()
    table_data = self.write_table_docs(converter, processor)
    es_retriever = ElasticsearchRetriever(
        document_store=self.document_store)
    print("SETTING UP DPR")
    dpr_retriever = DPRTrainingManager.get_current_retriever(
        self.document_store_faiss)
    print("SETTING UP EMBEDDINGS")
    embedding_retriever = EmbeddingRetriever(
        document_store=self.document_store_faiss,
        embedding_model="deepset/sentence_bert"
    )
    query_classifier = QueryClassifier()
    print("SETTING UP TABLE")
    table_retriever = TableRetriever(table_data)
    print("SETUP RETRIEVERS")
    self.question_generator = FurtherQuestionGenerator()
    print("UPDATING EMBEDDINGS")
    # Embeddings in the FAISS store are (re)computed with the DPR retriever.
    self.document_store_faiss.update_embeddings(dpr_retriever)
    print("UPDATED EMBEDDINGS")
    self.dpr_node = ContinualDPRNode(
        dpr_retriever, self.document_store_faiss)
    result = Result()
    self.trainer = DPRTrainingManager(
        self.document_store_faiss, self.dpr_node)
    print("SETUP COMPONENTS")
    # Graph: three retrievers run on the query, their results are merged,
    # the classifier routes to either the question generator (output_1) or
    # the table retriever (output_2), and both feed the final Result node.
    pipeline = Pipeline()
    pipeline.add_node(component=es_retriever,
                      name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=self.dpr_node,
                      name="DPRRetriever", inputs=["Query"])
    pipeline.add_node(component=embedding_retriever,
                      name="EmbeddingRetriever", inputs=["Query"])
    pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=[
        "DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
    pipeline.add_node(component=query_classifier,
                      name="QueryClassifier", inputs=["JoinResults"])
    pipeline.add_node(component=self.question_generator,
                      name="QnGenerator",
                      inputs=["QueryClassifier.output_1"])
    pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=[
        "QueryClassifier.output_2"])
    pipeline.add_node(component=result, name="Result", inputs=[
        "QnGenerator", "TableRetriever"])
    self.pipeline = pipeline
    print("SETUP PIPELINE")
def test_elasticsearch_custom_query(elasticsearch_fixture):
    """Custom query templates with a "terms" and a "term" filter return the expected hits."""
    index_name = "haystack_test_custom"
    Elasticsearch().indices.delete(index=index_name, ignore=[404])
    document_store = ElasticsearchDocumentStore(index=index_name,
                                                text_field="custom_text_field",
                                                embedding_field="custom_embedding_field")

    years = ["2019", "2020", "2021", "2021", "2021"]
    documents = [
        {"text": f"test_{number}", "meta": {"year": year}}
        for number, year in enumerate(years, start=1)
    ]
    document_store.write_documents(documents)

    # test custom "terms" query
    terms_query = """ { "size": 10, "query": { "bool": { "should": [{ "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}], "filter": [{"terms": {"year": ${years}}}]}}}"""
    retriever = ElasticsearchRetriever(document_store=document_store, custom_query=terms_query)
    output = retriever.run(query="test", filters={"years": ["2020", "2021"]})
    results = output[0]["documents"]
    assert len(results) == 4

    # test custom "term" query
    term_query = """ { "size": 10, "query": { "bool": { "should": [{ "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}], "filter": [{"term": {"year": ${years}}}]}}}"""
    retriever = ElasticsearchRetriever(document_store=document_store, custom_query=term_query)
    output = retriever.run(query="test", filters={"years": "2021"})
    results = output[0]["documents"]
    assert len(results) == 3
def get_document_store(document_store_type, es_similarity='cosine'):
    """Create an empty document store of the requested type.

    TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore

    :param document_store_type: "sql", "memory", "elasticsearch",
        "faiss_flat" or "faiss_hnsw".
    :param es_similarity: similarity metric; only used by the Elasticsearch store.
    :return: the new store; it is asserted to contain no documents.
    :raises Exception: for an unknown ``document_store_type``.
    """
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        # NOTE(review): this deletes 'haystack_test*' while the store below uses
        # "eval_document", so leftovers there would trip the final assert - confirm.
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document",
                                                    similarity=es_similarity)
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        index_type = "Flat" if document_store_type == "faiss_flat" else "HNSW"

        # TEMP FIX for an issue with deleting docs: always recreate the Postgres
        # container so the FAISS store starts from an empty database.
        subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        subprocess.run([
            'docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'
        ], shell=True)
        time.sleep(3)
        # No "-it": asking for a TTY makes `docker exec` fail when this runs
        # without a terminal (CI, cron, another process's pipe).
        subprocess.run([
            'docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'
        ], shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            sql_url="postgresql://*****:*****@localhost:5432/haystack",
            faiss_index_factory_str=index_type)
    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")

    assert document_store.get_document_count() == 0
    return document_store
def get_document_store(document_store_type, embedding_dim=768, embedding_field="embedding"):
    """Create a fresh test document store of the requested backend.

    :param document_store_type: "sql", "memory", "elasticsearch", "faiss",
        "milvus" or "weaviate".
    :param embedding_dim: vector size, forwarded to the memory, Elasticsearch,
        FAISS and Milvus stores.
    :param embedding_field: embedding field name, forwarded to the same stores.
    :raises Exception: for an unknown ``document_store_type``.
    """
    test_index = "haystack_test"

    if document_store_type == "sql":
        return SQLDocumentStore(url="sqlite://", index=test_index)

    if document_store_type == "memory":
        return InMemoryDocumentStore(return_embedding=True,
                                     embedding_dim=embedding_dim,
                                     embedding_field=embedding_field,
                                     index=test_index)

    if document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        Elasticsearch().indices.delete(index='haystack_test*', ignore=[404])
        return ElasticsearchDocumentStore(index=test_index,
                                          return_embedding=True,
                                          embedding_dim=embedding_dim,
                                          embedding_field=embedding_field)

    if document_store_type == "faiss":
        return FAISSDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index=test_index,
        )

    if document_store_type == "milvus":
        store = MilvusDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index=test_index,
        )
        # Drop every collection left over from earlier test runs.
        _, collections = store.milvus_server.list_collections()
        for collection in collections:
            if collection.startswith("haystack_test"):
                store.milvus_server.drop_collection(collection)
        return store

    if document_store_type == "weaviate":
        store = WeaviateDocumentStore(
            weaviate_url="http://localhost:8080", index="Haystacktest")
        # Wipe the whole schema, then recreate the index used by the tests.
        store.weaviate_client.schema.delete_all()
        store._create_schema_and_index_if_not_exist()
        return store

    raise Exception(
        f"No document store fixture for '{document_store_type}'")
def test_init_elastic_client():
    """Exercise valid and invalid host/port/api-key combinations of the store constructor."""
    # defaults
    ElasticsearchDocumentStore()

    # list of hosts + single port
    ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=9200)

    # list of hosts + list of ports (wrong)
    with pytest.raises(Exception):
        ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200])

    # list of hosts + list
    ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200, 9200])

    # only api_key
    with pytest.raises(Exception):
        ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test")

    # api_key + id
    ElasticsearchDocumentStore(host=["localhost"], port=[9200],
                               api_key="test", api_key_id="test")
def launch_elasticsearch(self, launch: bool = False, name: str = "hera"):
    """Start Elasticsearch in Docker and (re)index the files under ``self.data_path``.

    :param launch: when true, create and run a new container; otherwise
        restart the existing container (stop, then start).
    :param name: name of the Docker container to create or restart.
    :return: the ``subprocess.CompletedProcess`` of the ``docker run`` /
        ``docker start`` command.
    """
    import shlex

    # `name` ends up in a shell command line, so quote it.
    container = shlex.quote(name)

    logging.info("Starting Elasticsearch ...")
    if launch:
        status = subprocess.run(
            f'docker run -d -p 9200:9200 --name {container} -e "discovery.type=single-node" elasticsearch:7.6.2',
            shell=True)
    else:
        # Best-effort stop: a container that is not running only yields a
        # non-zero exit code, which is deliberately ignored.
        subprocess.run(f'docker stop {container}', shell=True)
        status = subprocess.run(f'docker start {container}', shell=True)
    # Give Elasticsearch time to boot before connecting.
    time.sleep(30)

    index = "document"
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index=index)
    dicts = convert_files_to_dicts(dir_path=self.data_path,
                                   clean_func=self.clean_website_text,
                                   split_paragraphs=True)
    try:
        document_store.delete_all_documents(index=index)
    except Exception:
        # Clearing is best effort (e.g. the index may not exist yet); record
        # the failure instead of hiding it, then index anyway.
        logging.warning("Could not clear index '%s' before indexing", index, exc_info=True)
    document_store.write_documents(dicts)
    return status
def update_document():
    """Index an uploaded document and return a JSON status with the index's search URL.

    Reads ``index`` from the form and ``doc`` from the uploaded files, stores
    the file in the configured input directory, writes the converted contents
    of that directory to Elasticsearch, and removes the uploaded file again.

    :return: JSON string with ``status``, ``message`` and ``result`` keys.
    """
    no_file_response = json.dumps({
        'status': 'Failed',
        'message': 'No file uploaded',
        'result': []
    })
    if not request.files:
        return no_file_response

    # index is the target document where queries need to sent.
    index = request.form['index']
    # uploaded document for target source
    doc = request.files["doc"]

    # The filename is client-controlled: keep only its last path component so
    # it cannot escape the input directory (e.g. "../../etc/passwd").
    filename = os.path.basename((doc.filename or "").replace("\\", "/"))
    if filename in ("", ".", ".."):
        return no_file_response
    file_path = os.path.join(app.config["input"], filename)

    # saving the file to the input directory
    doc.save(file_path)
    try:
        # initialization of the Haystack Elasticsearch document storage
        document_store = ElasticsearchDocumentStore(
            host=app.config["host"],
            port=app.config["port"],
            username=app.config["username"],
            password=app.config["password"],
            index=index)
        # convert the pdf files into dictionary and update to ElasticSearch Document
        dicts = convert_files_to_dicts(app.config["input"],
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=False)
        document_store.write_documents(dicts)
    finally:
        # Always remove the upload, even if indexing failed; otherwise it
        # would be indexed again by the next request.
        os.remove(file_path)

    # NOTE(review): the misspelt 'Susccess' is kept because clients may match on it.
    return json.dumps({
        'status': 'Susccess',
        'message': 'document available at http://' + app.config["host"] + ':' +
                   str(app.config["port"]) + '/' + index + '/_search',
        'result': []
    })
def launch_and_index_es(documents_dicts: List):
    """Ensure a local Elasticsearch is running, index the documents, return a retriever.

    If nothing answers on ``http://localhost:9200/`` a Docker container is
    started. The ``document`` index is then dropped and refilled with
    ``documents_dicts``.

    :param documents_dicts: documents to write to the ``document`` index.
    :return: an ``ElasticsearchRetriever`` over the freshly filled store.
    :raises Exception: if the ``docker run`` command exits with a non-zero code.
    """
    es = Elasticsearch(['http://localhost:9200/'], verify_certs=True)
    if not es.ping():
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'
        ], shell=True)
        if status.returncode:
            # The trailing space matters: adjacent literals are concatenated as-is.
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        # Give the new container time to come up.
        sleep(7)

    # Start from an empty index; a missing index (400/404) is not an error.
    es.indices.delete(index='document', ignore=[400, 404])
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    document_store.write_documents(documents_dicts)
    retriever = ElasticsearchRetriever(document_store=document_store)
    return retriever
def set_embeded():
    """Compute dense-passage embeddings for the requested index and return a JSON status.

    The target index name is read from the ``index`` form field.
    """
    target_index = request.form['index']
    settings = app.config

    document_store = ElasticsearchDocumentStore(
        host=settings["host"],
        port=settings["port"],
        username=settings["username"],
        password=settings["password"],
        index=target_index,
        embedding_field="embedding",
        embedding_dim=768)
    retriever = DensePassageRetriever(document_store=document_store,
                                      embedding_model="dpr-bert-base-nq",
                                      do_lower_case=True,
                                      use_gpu=False)

    # Now update the retriever embedded to the elasticsearch document
    document_store.update_embeddings(retriever)

    response = {
        'status': 'Susccess',
        'message': 'Sucessfully embeded method updated in ElasticSearch Document',
        'result': []
    }
    return json.dumps(response)
def qna():
    """Answer a question against an Elasticsearch index and return the top answers as JSON.

    Form fields: ``question``, ``index`` (index to query), ``mode``
    ("trained" picks the configured fine-tuned model, anything else the
    pre-trained one) and ``n`` (number of answers).
    """
    form = request.form
    question = form['question']
    # index is the target document where queries need to sent.
    index = form['index']
    # to select train or untrained model
    mode = form['mode']

    # initialization of the Haystack Elasticsearch document storage
    document_store = ElasticsearchDocumentStore(
        host=app.config["host"],
        username=app.config["username"],
        password=app.config["password"],
        index=index)

    # Fine-tuned model in 'trained' mode, the pre-trained one otherwise.
    if mode == 'trained':
        model_path = app.config["train_model"]
    else:
        model_path = "distilbert-base-uncased-distilled-squad"
    reader = FARMReader(model_name_or_path=model_path, use_gpu=False)

    # initialization of ElasticRetriever
    retriever = ElasticsearchRetriever(document_store=document_store)
    # Finder sticks together reader and retriever
    # in a pipeline to answer our actual questions.
    finder = Finder(reader, retriever)

    # predict n answers
    n = int(form['n'])
    prediction = finder.get_answers(question=question,
                                    top_k_retriever=10,
                                    top_k_reader=n)
    answer = [res['answer'] for res in prediction['answers']]

    return json.dumps({
        'status': 'success',
        'message': 'Process succesfully',
        'result': answer
    })
def __init__(self, id, add_sample_data=False):
    """Set up an embedding-based FAQ finder on the index named after this model's id.

    :param id: model identifier, passed to ``Model.__init__``.
    :param add_sample_data: when true, load the FAQ sample data after setup.
    """
    Model.__init__(self, id)

    # Question embeddings live in "question_emb" and are excluded from returned meta data.
    store_options = dict(
        host=DB_HOST,
        port=DB_PORT,
        index=self.id,
        embedding_field="question_emb",
        embedding_dim=768,
        excluded_meta_data=["question_emb"],
    )
    doc_store = ElasticsearchDocumentStore(**store_options)
    retriever = EmbeddingRetriever(document_store=doc_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    # Retrieval only: no reader is attached.
    self.finder = Finder(reader=None, retriever=retriever)

    if add_sample_data:
        add_sample_data_faq_qa(self)
def get_document_store(document_store_type, faiss_document_store, inmemory_document_store):
    """Return the document store matching ``document_store_type``.

    :param document_store_type: "sql", "memory", "elasticsearch" or "faiss".
    :param faiss_document_store: store returned as-is for "faiss".
    :param inmemory_document_store: store returned as-is for "memory".
    :raises Exception: for an unknown ``document_store_type``.
    """
    if document_store_type == "memory":
        return inmemory_document_store

    if document_store_type == "faiss":
        return faiss_document_store

    if document_store_type == "sql":
        # Remove the database file of a previous run before reconnecting.
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        return SQLDocumentStore(url="sqlite:///haystack_test.db")

    if document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        Elasticsearch().indices.delete(index='haystack_test*', ignore=[404])
        return ElasticsearchDocumentStore(index="haystack_test", return_embedding=False)

    raise Exception(f"No document store fixture for '{document_store_type}'")
def __init__(self, id, add_sample_data=False):
    """Set up an extractive QA finder on the index named after this model's id.

    :param id: model identifier, passed to ``Model.__init__``.
    :param add_sample_data: when true, load the document-QA sample data.
    """
    Model.__init__(self, id)

    doc_store = ElasticsearchDocumentStore(host=DB_HOST, port=DB_PORT, index=self.id)
    retriever = ElasticsearchRetriever(document_store=doc_store)

    reader_options = {
        "model_name_or_path": READER_MODEL_PATH,
        "batch_size": BATCHSIZE,
        "use_gpu": False,
        "num_processes": MAX_PROCESSES,
    }
    reader = FARMReader(**reader_options)
    self.finder = Finder(reader, retriever)

    if add_sample_data:
        add_sample_data_doc_qa(self)

    # Persist the reader under the path it was loaded from.
    reader.save(directory=READER_MODEL_PATH)
    print("saved")
def create_app(config_name):
    """Application factory: build the Flask app and attach a QA ``Finder`` to it.

    :param config_name: key into the module-level ``config`` mapping; the
        selected object provides ``ELASTIC_URL``, ``ELASTIC_PORT`` and
        ``ELASTIC_INDEX``.
    :return: the configured Flask application, with ``app.finder`` set and the
        main blueprint registered.
    """
    app = Flask(__name__)
    app.config.from_object(config[config_name])

    host = config[config_name].ELASTIC_URL
    port = config[config_name].ELASTIC_PORT
    index = config[config_name].ELASTIC_INDEX

    # The configured port used to be read and then ignored, so the store always
    # connected on its default port. Pass it through.
    doc_store = ElasticsearchDocumentStore(host=host,
                                           port=port,
                                           username='',
                                           password='',
                                           index=index)
    retriever = ElasticsearchRetriever(document_store=doc_store)

    model_name = "deepset/roberta-base-squad2"
    reader = FARMReader(model_name_or_path=model_name,
                        num_processes=0,
                        use_gpu=False)
    app.finder = Finder(reader, retriever)

    # Imported here rather than at module level, as in the original.
    from app.main import main as main_blueprint
    app.register_blueprint(main_blueprint)
    return app
import pickle

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from tqdm import tqdm

document_store = ElasticsearchDocumentStore(refresh_type='false')


def concat_sents(sent):
    """Group consecutive sentences into chunks of just over 200 words.

    Sentences are concatenated as-is (no separator is inserted). A chunk is
    emitted as soon as its word count exceeds 200. Whatever remains at the end
    is emitted as a final, shorter chunk, so no text is lost.

    :param sent: iterable of sentences.
    :return: generator of concatenated text chunks.
    """
    word_count = 0
    parts = []
    for s in sent:
        text = str(s)
        word_count += len(text.split())
        parts.append(text)
        if word_count > 200:
            yield ''.join(parts)
            word_count = 0
            parts = []
    # Previously the tail (and every input of 200 words or fewer) was dropped.
    if parts:
        yield ''.join(parts)


# NOTE(review): pickle.load executes arbitrary code from the file - only use
# with data files you produced yourself.
with open('../data/arxiv-processed-pickle', 'rb') as f:
    dic = pickle.load(f)

for (sents, name) in tqdm(dic):
    to_insert = [{'text': s, 'meta': {'name': name}} for s in concat_sents(sents)]
    document_store.write_documents(to_insert)
from transformers import AutoTokenizer, AutoModel
from definitions import from_root_dir
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from pulp.cy.dense.recv import TransformersEmbeddingRetriever


def to_meta_dict(meta: dict) -> dict:
    """Turn a metadata record into a document dict.

    The 'abstract' entry (``None`` when absent) becomes the document text and
    is removed from ``meta`` in place; the remaining entries become the meta data.
    """
    abstract = meta.pop('abstract', None)
    return {'text': abstract, 'meta': meta}


store = ElasticsearchDocumentStore(refresh_type='false', index='meta')

embedding_model = AutoModel.from_pretrained(
    from_root_dir('models/scibert_scivocab_uncased'))
tokenizer = AutoTokenizer.from_pretrained(
    from_root_dir('models/scibert_scivocab_uncased'))
retriever = TransformersEmbeddingRetriever(document_store=store,
                                           embedding_model=embedding_model,
                                           tokenizer=tokenizer)

with open(from_root_dir('data/arxiv-metadata_pickle'), 'rb') as f:
    import pickle
    records = pickle.load(f)
    # Documents are produced lazily, one per metadata record.
    store.write_documents((to_meta_dict(record) for record in records))
    store.update_embeddings(retriever, index='meta')
# retriever.embed(['It is shown that, within a Ginzburg-Landau (GL) formalism, the\nsuperconducting fluctuation is insulating at zero temperature even if the\nfluctuation dynamics is metallic (dissipative). Based on this fact, the low\ntemperature behavior of the $H_{c2}$-line and the resistivity curves near a\nzero temperature transition are discussed. In particular, it is pointed out\nthat the neglect of quantum fluctuations in data analysis of the dc resistivity\nmay lead to an under-estimation of the $H_{c2}$ values near zero temperature.\n'])
if LAUNCH_ELASTICSEARCH:
    logging.info("Starting Elasticsearch ...")
    status = subprocess.run([
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'
    ], shell=True)
    if status.returncode:
        # The trailing space matters: adjacent literals are concatenated as-is.
        raise Exception(
            "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
            "then set LAUNCH_ELASTICSEARCH in the script to False.")
    # Give the container time to come up before connecting.
    time.sleep(15)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document")

# ## Preprocessing of documents
#
# Haystack provides a customizable pipeline for:
# - converting files into texts
# - cleaning texts
# - splitting texts
# - writing them to a Document Store

# In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and add
# them in Elasticsearch.

# Let's first fetch some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
import json import glob import os import pprint import re from bs4 import BeautifulSoup import logging from haystack.preprocessor.utils import convert_files_to_dicts from haystack.utils import print_answers from haystack.document_store.elasticsearch import ElasticsearchDocumentStore from haystack.retriever.sparse import ElasticsearchRetriever DB_HOST = os.getenv("DB_HOST", "localhost") document_store = ElasticsearchDocumentStore(host=DB_HOST, username="", password="", index="document") document_store.delete_all_documents(index='document') # Elesier dataset with full text def nf2020toDict(): paths = "./data/HACKXML0000000004/**/*.xml" target_tags = ['simple-para', 'para'] docs = [] for path in glob.glob(paths)[1:]: with open(path, 'r') as f: data = BeautifulSoup(f.read(), "xml") temp = {} temp["meta"] = {}
# Reader model and retrieval settings.
reader_name = "deepset/roberta-base-squad2"
top_k_retriever = 7
top_k_reader = 1
conversational = 'True'  # NOTE(review): a string, not a bool - confirm how it is consumed

# Use transformer reader
reader = FARMReader(model_name_or_path=reader_name, use_gpu=True)

print('Fetching documents for book ' + book_title)
document_fetcher_func = top_50_wiki_results_2
num_docs = document_fetcher_func(book_title)
print('Fetched ' + str(num_docs) + ' documents for book ' + book_title)

# Connect to the local Elasticsearch and clear the "default" index before re-indexing.
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="default")
document_store.delete_all_documents(index="default")
#document_store = InMemoryDocumentStore()

# Convert the files under <root>/documents into document dicts.
doc_dir = root + "/documents"
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Add documents to the document store
document_store.write_documents(dicts)

# Use ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)
#retriever = TfidfRetriever(document_store=document_store)
prepared_segment["text"] = segment prepared_segments_part.append(prepared_segment) if i + 1 % SEGMENT_BATCH_SIZE == 0: prepared_segments.append(prepared_segments_part) prepared_segments_part = [] if prepared_segments_part: prepared_segments.append(prepared_segments_part) return prepared_segments from haystack.document_store.elasticsearch import ElasticsearchDocumentStore # init haystack ES client with custom mappings document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", custom_mapping=mapping) # ingest full document (no splitting to segments) def prepare_ingest_full_document(sector, date, title, link, page_content): document = {} document["title"] = title document["sector"] = sector.strip() document["given_date"] = date try: document["date"] = str(datetime.strptime(date, "%d %b %Y").date()) except: pass document["link"] = link
from haystack.retriever.sparse import ElasticsearchRetriever, ElasticsearchFilterOnlyRetriever from haystack.retriever.dense import EmbeddingRetriever logger = logging.getLogger(__name__) router = APIRouter() # Init global components: DocumentStore, Retriever, Reader, Finder document_store = ElasticsearchDocumentStore( host=DB_HOST, port=DB_PORT, username=DB_USER, password=DB_PW, index=DB_INDEX, scheme=ES_CONN_SCHEME, ca_certs=False, verify_certs=False, text_field=TEXT_FIELD_NAME, name_field=NAME_FIELD_NAME, search_fields=SEARCH_FIELD_NAME, embedding_dim=EMBEDDING_DIM, embedding_field=EMBEDDING_FIELD_NAME, excluded_meta_data=EXCLUDE_META_DATA_FIELDS, # type: ignore faq_question_field=FAQ_QUESTION_FIELD_NAME, ) if RETRIEVER_TYPE == "EmbeddingRetriever": retriever = EmbeddingRetriever( document_store=document_store, embedding_model=EMBEDDING_MODEL_PATH, model_format=EMBEDDING_MODEL_FORMAT,
def tutorial1_basic_qa_pipeline():
    """Tutorial 1: build and query a basic extractive QA pipeline on Elasticsearch.

    Launches Elasticsearch in Docker, downloads and indexes Game of Thrones
    wiki articles, wires an ``ElasticsearchRetriever`` and a ``FARMReader``
    into an ``ExtractiveQAPipeline``, asks one question and prints the answers.

    :raises Exception: if the ``docker run`` command exits with a non-zero code.
    """
    logger = logging.getLogger(__name__)

    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of
    # `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.
    #
    # **Here:** We recommended Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval,
    # and vector storage for text embeddings.
    # **Alternatives:** If you are unable to setup an Elasticsearch instance, then follow the Tutorial 3
    # for using SQL/InMemory document stores.
    # **Hint**:
    # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can
    # configure Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in
    # your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source.

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ], shell=True)
        if status.returncode:
            # The trailing space matters: adjacent literals are concatenated as-is.
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        # Give the container time to come up before connecting.
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and add
    # them in Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance we should not index the same documents again. \n"
            "If you still want to do this call: document_store.write_documents(dicts) manually "
        )

    # ## Initalize Retriever, Reader, & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrowing down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use some simple but fast algorithm.
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    # - Customize the `ElasticsearchRetriever`with custom queries (e.g. boosting) and filters
    # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of
    #   embeddings (e.g. created via Sentence-BERT)
    # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging

    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes
    # with SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean
    #           the model prefers "no answer possible"
    #
    # #### FARMReader

    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # #### TransformersReader

    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
ES_CONN_SCHEME, TEXT_FIELD_NAME, SEARCH_FIELD_NAME, EMBEDDING_DIM, EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, FAQ_QUESTION_FIELD_NAME, CREATE_INDEX, VECTOR_SIMILARITY_METRIC, UPDATE_EXISTING_DOCUMENTS) router = APIRouter() document_store = ElasticsearchDocumentStore( host=DB_HOST, port=DB_PORT, username=DB_USER, password=DB_PW, index=DB_INDEX, label_index=DB_INDEX_FEEDBACK, scheme=ES_CONN_SCHEME, ca_certs=False, verify_certs=False, text_field=TEXT_FIELD_NAME, search_fields=SEARCH_FIELD_NAME, faq_question_field=FAQ_QUESTION_FIELD_NAME, embedding_dim=EMBEDDING_DIM, embedding_field=EMBEDDING_FIELD_NAME, excluded_meta_data=EXCLUDE_META_DATA_FIELDS, # type: ignore create_index=CREATE_INDEX, update_existing_documents=UPDATE_EXISTING_DOCUMENTS, similarity=VECTOR_SIMILARITY_METRIC) class FAQQAFeedback(BaseModel): question: str = Field( ..., description="The question input by the user, i.e., the query.") is_correct_answer: bool = Field(