Example #1
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.io import write_documents_to_db


def test_db_write_read():
    sql_document_store = SQLDocumentStore()
    write_documents_to_db(document_store=sql_document_store,
                          document_dir="samples/docs")
    documents = sql_document_store.get_all_documents()
    assert len(documents) == 2
    doc = sql_document_store.get_document_by_id("1")
    assert doc.keys() == {"id", "name", "text", "tags"}
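
The test assumes a `samples/docs` directory containing exactly two text files. A minimal sketch of creating such fixture data; the file names and contents here are illustrative, not from the source:

import os

os.makedirs("samples/docs", exist_ok=True)
for name, text in [("doc_1.txt", "A first sample document."),
                   ("doc_2.txt", "A second sample document.")]:
    with open(os.path.join("samples/docs", name), "w") as f:
        f.write(text)  # illustrative content only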
Example #2
def test_db_write_read():
    from haystack.database import db
    from haystack.indexing.io import write_documents_to_db

    db.drop_all()
    db.create_all()

    write_documents_to_db(document_dir="samples/docs")
    # `Document` is Haystack's SQLAlchemy model (its import path varies by version)
    documents = db.session.query(Document).order_by(Document.text).all()
    assert len(documents) == 2
    assert documents[0].text == 'A Doc specifically talking about haystack.\nHaystack can be used to scale QA models to large document collections.'
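
The query assumes a SQLAlchemy `Document` model bound to the same `db` handle. A rough sketch of what such a model might look like, inferred from the fields these examples read (column names match the examples; types and lengths are assumptions):

from haystack.database import db


class Document(db.Model):
    # Assumed schema, inferred from the fields used in these examples
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(255))
    text = db.Column(db.Text)
    tags = db.Column(db.Text)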
Example #3
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.io import write_documents_to_db


def test_sql_write_read():
    sql_document_store = SQLDocumentStore()
    write_documents_to_db(document_store=sql_document_store,
                          document_dir="samples/docs")
    documents = sql_document_store.get_all_documents()
    assert len(documents) == 2
    doc = sql_document_store.get_document_by_id("1")
    assert doc.id
    assert doc.text
Example #4
from time import sleep

from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.indexing.io import write_documents_to_db


def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    write_documents_to_db(document_store=document_store,
                          document_dir="samples/docs")
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
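
`elasticsearch_fixture` is a pytest fixture defined elsewhere in the test suite; it is expected to have an Elasticsearch node listening on localhost before the test body runs. A minimal sketch of such a fixture, assuming Docker is available (image tag, container name, and wait time are assumptions):

import subprocess
import time

import pytest


@pytest.fixture(scope="session")
def elasticsearch_fixture():
    # Start a throwaway single-node Elasticsearch; image tag is an assumption
    subprocess.run(["docker", "run", "-d", "--name", "es_test",
                    "-p", "9200:9200", "-e", "discovery.type=single-node",
                    "elasticsearch:7.6.1"], check=True)
    time.sleep(15)  # crude wait for the node to accept connections
    yield
    subprocess.run(["docker", "rm", "-f", "es_test"], check=True)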
Example #5
def write_to_db():
    try:
        # TODO: Get DOCS_DIR from config
        write_documents_to_db(document_store=document_store, document_dir=doc_dir,
                              only_empty_db=True, split_paragraphs=True)
        return True
    except Exception:
        return jsonify("Cannot write to DB")
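
`jsonify` implies a Flask app, and `document_store`/`doc_dir` are expected to exist at module level (the TODO suggests they should eventually come from config). A minimal sketch of how the handler might be wired up; the route path, app setup, and placeholder values are assumptions:

from flask import Flask, jsonify
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.io import write_documents_to_db

app = Flask(__name__)
document_store = SQLDocumentStore()  # placeholder; the source configures this elsewhere
doc_dir = "samples/docs"             # placeholder until DOCS_DIR comes from config


@app.route("/write-documents", methods=["POST"])  # hypothetical route
def write_documents():
    if write_to_db() is True:
        return jsonify("OK")
    return jsonify("Cannot write to DB"), 500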
Example #6
"""
Load preprocessed text files into ES
"""

import logging
import subprocess
import time

from haystack import Finder
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.retriever.elasticsearch import ElasticsearchRetriever

# Load data into ES

doc_dir = "/home/sebastian/SideProject/QA/wikiextractor/preprocessed/folder_1"
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document")
write_documents_to_db(
    document_store=document_store,
    document_dir=doc_dir,
    #clean_func=clean_wiki_text,
    only_empty_db=False,
    split_paragraphs=True)
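
The otherwise-unused imports (`Finder`, `ElasticsearchRetriever`, `FARMReader`, `print_answers`) suggest the script goes on to query the indexed documents. A brief sketch of that follow-up in the same old-style Haystack API; the model name and question are illustrative:

retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
finder = Finder(reader, retriever)

prediction = finder.get_answers(question="What is the capital of Germany?",
                                top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="minimal")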
Example #7
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever

# Let's first get some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# The documents can be stored in different types of "DocumentStores".
# For dev we suggest a lightweight SQL DB
# For production we suggest Elasticsearch
document_store = SQLDocumentStore(url="sqlite:///qa.db")

# Now, let's write the docs to our DB.
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
write_documents_to_db(document_store=document_store,
                      document_dir=doc_dir,
                      clean_func=clean_wiki_text,
                      only_empty_db=True)

## Initialize Reader, Retriever & Finder

# A retriever identifies the k most promising chunks of text that might contain the answer for our question
# Retrievers use some simple but fast algorithm, here: TF-IDF
retriever = TfidfRetriever(document_store=document_store)

# A reader scans the text chunks in detail and extracts the k best answers
# Readers use more powerful but slower deep learning models
# You can select a local model or any of the QA models published on Hugging Face's model hub (https://huggingface.co/models)
# Here: a medium-sized BERT QA model trained via FARM on SQuAD 2.0
reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2",
                    use_gpu=False)
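
With retriever and reader in place, the usual next step in this API version is to combine them in a Finder and ask a question; a short sketch (the question is illustrative):

from haystack import Finder
from haystack.utils import print_answers

finder = Finder(reader, retriever)
prediction = finder.get_answers(question="Who is the father of Arya Stark?",
                                top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="minimal")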
Example #8
# TODO: Enable CORS

from pathlib import Path

from fastapi import FastAPI

from haystack.database import db
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db

MODELS_DIRS = ["model"]
USE_GPU = False
BATCH_SIZE = 16

#############################################
## Indexing & cleaning documents
# Init a database (default: sqlite)

db.create_all()

# Load the documents we want to query
doc_dir = "../data"
# Note: in our case this required changing the function in io.py to open files with encoding='utf-8'
write_documents_to_db(document_dir=doc_dir,
                      clean_func=clean_wiki_text)  # , only_empty_db=True

app = FastAPI(title="Haystack API for Taschenhirn", version="0.1")

#############################################
# Load all models in memory
#############################################

model_paths = []
for model_dir in MODELS_DIRS:
    path = Path(model_dir)
    if path.is_dir():
        models = [f for f in path.iterdir() if f.is_dir()]
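
The excerpt ends mid-way through model discovery. A hypothetical continuation, not from the source, showing how the discovered model directories might be loaded into FARMReader instances keyed by directory name:

from haystack.reader.farm import FARMReader

# Hypothetical continuation: load each discovered model directory into a reader
readers = {}
for model_dir in MODELS_DIRS:
    path = Path(model_dir)
    if path.is_dir():
        for model_path in (f for f in path.iterdir() if f.is_dir()):
            readers[model_path.name] = FARMReader(model_name_or_path=str(model_path),
                                                  batch_size=BATCH_SIZE,
                                                  use_gpu=USE_GPU)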