def get_document_store(document_store_type, similarity='dot_product'):
    """ TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore"""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
        assert document_store.get_document_count() == 0
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document", similarity=similarity, timeout=3000)
    elif document_store_type in ("milvus_flat", "milvus_hnsw"):
        if document_store_type == "milvus_flat":
            index_type = IndexType.FLAT
            index_param = None
            search_param = None
        elif document_store_type == "milvus_hnsw":
            index_type = IndexType.HNSW
            index_param = {"M": 64, "efConstruction": 80}
            search_param = {"ef": 20}
        document_store = MilvusDocumentStore(similarity=similarity, index_type=index_type,
                                             index_param=index_param, search_param=search_param)
        assert document_store.get_document_count(index="eval_document") == 0
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"
        # launch a fresh Postgres container and create an empty DB for the FAISS metadata
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run(
            ['docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'],
            shell=True)
        time.sleep(6)
        status = subprocess.run(
            ['docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'],
            shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(sql_url="postgresql://*****:*****@localhost:5432/haystack",
                                            faiss_index_factory_str=index_type,
                                            similarity=similarity)
        assert document_store.get_document_count() == 0
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
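# Usage sketch (assumed, not from the original source): exercising the helper above
# against a local Elasticsearch instance reachable on localhost:9200.
store = get_document_store("elasticsearch", similarity="cosine")
print(store.get_document_count())  # index "eval_document" was just created, so this prints 0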
def get_document_store(document_store_type, embedding_dim=768, embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://", index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(return_embedding=True,
                                               embedding_dim=embedding_dim,
                                               embedding_field=embedding_field,
                                               index="haystack_test")
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test",
                                                    return_embedding=True,
                                                    embedding_dim=embedding_dim,
                                                    embedding_field=embedding_field)
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        # drop any leftover test collections so we start from a clean state
        _, collections = document_store.milvus_server.list_collections()
        for collection in collections:
            if collection.startswith("haystack_test"):
                document_store.milvus_server.drop_collection(collection)
        return document_store
    elif document_store_type == "weaviate":
        document_store = WeaviateDocumentStore(weaviate_url="http://localhost:8080", index="Haystacktest")
        document_store.weaviate_client.schema.delete_all()
        document_store._create_schema_and_index_if_not_exist()
        return document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
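# Usage sketch (assumed): an in-memory store sized for 384-dim embeddings, e.g. from a
# sentence-transformers MiniLM model; the dimension and sample document are illustrative only.
store = get_document_store("memory", embedding_dim=384)
store.write_documents([{"text": "Hello Haystack", "meta": {"name": "greeting"}}])
print(store.get_document_count())  # -> 1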
def load(self):
    if self.finder and self.finder2:
        return
    if not self.document_store2:
        # the FAISS index must have been saved in preprocess before it can be loaded here
        self.document_store2 = FAISSDocumentStore.load(sql_url=sqlUrlFAQ, faiss_file_path='faiss2')
        self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)
    # else:  # reset session
    #     self.document_store2.session.close()
    #     super(FAISSDocumentStore, self.document_store2).__init__(url=sqlUrlFAQ)
    if not self.retriever2:
        self.retriever2 = EmbeddingRetriever(document_store=self.document_store2,
                                             embedding_model="sentence_bert-saved",
                                             use_gpu=False)
    if not self.finder2:
        self.finder2 = Finder(reader=None, retriever=self.retriever2)
    if not self.document_store:
        # alternative: FAISSDocumentStore.load(faiss_file_path='faiss1', sql_url=sqlUrl)
        self.document_store = SQLDocumentStore(url=sqlUrl)
        self.initSql(url=sqlUrl, document_store=self.document_store)
    # else:  # reset session
    #     self.document_store.session.close()
    #     super(FAISSDocumentStore, self.document_store).__init__(url=sqlUrl)
    # reduce load by sharing the same EmbeddingRetriever and swapping the store on the fly?
    # self.retriever = EmbeddingRetriever(document_store=self.document_store,
    #                                     embedding_model="sentence_bert-saved",
    #                                     use_gpu=False) if not self.retriever else self.retriever
    if not self.retriever:
        self.retriever = TfidfRetriever(document_store=self.document_store)
    if not self.reader:
        self.reader = FARMReader(model_name_or_path=modelDir, use_gpu=False, no_ans_boost=0)
        # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
        #                             tokenizer="distilbert-base-uncased", use_gpu=-1)
    if not self.finder:
        self.finder = Finder(self.reader, self.retriever)
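# Hedged usage sketch: QAService is a hypothetical name for the class owning load();
# the Finder query methods follow the pre-1.0 Haystack API that this code targets.
qa = QAService()
qa.load()
# extractive QA via TF-IDF retriever + FARM reader
answers = qa.finder.get_answers(question="What is Haystack?", top_k_retriever=10, top_k_reader=3)
# FAQ-style QA via embedding similarity, no reader involved
faq_hits = qa.finder2.get_answers_via_similar_questions(question="What is Haystack?", top_k_retriever=3)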
def get_document_store(document_store_type, embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://", index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(
            return_embedding=True, embedding_field=embedding_field, index="haystack_test"
        )
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test", return_embedding=True, embedding_field=embedding_field
        )
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
def get_document_store(document_store_type, es_similarity='cosine'):
    """ TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore"""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document", similarity=es_similarity)
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"
        # TEMP FIX for issue with deleting docs: instead of reusing a running Postgres
        # instance, always launch a fresh one and create an empty DB.
        # try:
        #     document_store = FAISSDocumentStore(sql_url="postgresql://*****:*****@localhost:5432/haystack",
        #                                         faiss_index_factory_str=index_type)
        # except:
        #     logger.info("Didn't find Postgres. Start a new instance...")
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run(
            ['docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'],
            shell=True)
        time.sleep(3)
        status = subprocess.run(
            ['docker exec -it haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'],
            shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(sql_url="postgresql://*****:*****@localhost:5432/haystack",
                                            faiss_index_factory_str=index_type)
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
def get_document_store(document_store_type, faiss_document_store, inmemory_document_store):
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = inmemory_document_store
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test", return_embedding=False)
    elif document_store_type == "faiss":
        document_store = faiss_document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
    return document_store
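# Hedged sketch (assumed, not from the original source): how a helper like this is
# typically wired into pytest, parametrizing tests over the supported backends;
# the fixture names mirror the arguments above.
import pytest

@pytest.fixture(params=["sql", "memory", "elasticsearch", "faiss"])
def document_store(request, faiss_document_store, inmemory_document_store):
    return get_document_store(request.param, faiss_document_store, inmemory_document_store)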
processor = Processor.convert_from_transformers(modelName,
                                                task_type="question_answering",
                                                max_seq_len=384,
                                                doc_stride=128)
model.save(modelDir)
processor.save(modelDir)
# Don't rely on the transformers cache (or configure cache_dir in from_pretrained);
# the model must be persisted explicitly with save(), e.g.:
#   model = TransformersReader(model_name_or_path=modelName, use_gpu=-1)
#   model = FARMReader(model_name_or_path=modelName, use_gpu=False, no_ans_boost=0)

document_store = SQLDocumentStore(url=sqlUrl)
# document_store = FAISSDocumentStore(sql_url=sqlUrl)

# Convert files to dicts containing documents that can be indexed to our datastore.
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers).
# It must take a str as input, and return a str.
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=False)
# dicts = tika_convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=False)

# We now have a list of dictionaries that we can write to our document store.
# If your texts come from a different source (e.g. a DB), you can of course skip
# convert_files_to_dicts() and create the dictionaries yourself.
# The default format here is: {"name": "<some-document-name>", "text": "<the-actual-text>"}
# Let's have a look at the first 3 entries:
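# Assumed continuation (the comment above announces it): peek at the first 3 converted dicts.
print(dicts[:3])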
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

# In-Memory Document Store
# from haystack.document_store.memory import InMemoryDocumentStore
# document_store = InMemoryDocumentStore()

from haystack.document_store.sql import SQLDocumentStore
document_store = SQLDocumentStore(url="sqlite:///qa.db")

import shutil
import os

# Let's first get some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got2"
# s3_url = "https://drive.google.com/uc?export=download&id=1tgQhrDu5cZJ9xp2Uj3rzfa0j0OuW3UPV"
# original: "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
# fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert files to dicts containing documents that can be indexed to our datastore.
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers).
# It must take a str as input, and return a str.
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# We now have a list of dictionaries that we can write to our document store.
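# Assumed next step from the tutorial this script follows: index the dicts.
# write_documents() is the standard DocumentStore API for this in Haystack 0.x.
document_store.write_documents(dicts)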