Exemplo n.º 1
0
def test_tfidf_retriever():
    """Check the top TF-IDF hit for "godzilla" over three in-memory documents.

    Three name/text documents are written to an ``InMemoryDocumentStore``,
    a ``TfidfRetriever`` is fitted on that store, and the single best match
    for the query ``"godzilla"`` is compared against the expected
    ``Document``.
    """
    from haystack.retriever.tfidf import TfidfRetriever
    from haystack.database.memory import InMemoryDocumentStore

    names_and_texts = [
        ("testing the finder 1", "godzilla says hello"),
        ("testing the finder 2", "optimus prime says bye"),
        ("testing the finder 3", "alien says arghh"),
    ]
    corpus = [{"name": name, "text": text} for name, text in names_and_texts]

    store = InMemoryDocumentStore()
    store.write_documents(corpus)

    tfidf = TfidfRetriever(store)
    tfidf.fit()

    hits = tfidf.retrieve("godzilla", top_k=1)
    expected_hit = Document(
        id='0',
        text='godzilla says hello',
        external_source_id=None,
        question=None,
        query_score=None,
        meta={},
    )
    assert hits == [expected_hit]
Exemplo n.º 2
0
def test_finder_get_answers_with_in_memory_store():
    """Run a Finder (TF-IDF retriever + Transformers reader) over an in-memory store.

    Three small documents are written to an ``InMemoryDocumentStore`` and the
    question ``"testing finder"`` is asked; the test only requires that
    ``get_answers`` returns something other than ``None``.
    """
    from haystack.database.memory import InMemoryDocumentStore

    # Three documents that differ only by their running number.
    corpus = [
        {
            "name": f"testing the finder {number}",
            "text": f"testing the finder with pyhton unit test {number}",
            'meta': {'url': 'url'},
        }
        for number in (1, 2, 3)
    ]

    store = InMemoryDocumentStore()
    store.write_documents(corpus)

    tfidf = TfidfRetriever(document_store=store)
    qa_reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )
    qa_finder = Finder(qa_reader, tfidf)

    prediction = qa_finder.get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    assert prediction is not None
Exemplo n.º 3
0
def test_faq_retriever_in_memory_store():
    """FAQ-style retrieval: embed questions, store them, and fetch one similar question.

    Every document carries the same answer text; the question lives in
    ``meta['question']``. Each question is embedded with an
    ``EmbeddingRetriever`` and stored under the ``embedding`` key before the
    documents are written. The test passes when asking "How to test this?"
    with ``top_k_retriever=1`` yields exactly one answer.
    """
    from haystack.database.memory import InMemoryDocumentStore
    from haystack.retriever.dense import EmbeddingRetriever

    store = InMemoryDocumentStore(embedding_field="embedding")

    answer = 'By running tox in the command line!'
    real_question = 'How to test this library?'
    filler_question = 'blah blah blah'

    # One genuine FAQ entry followed by ten identical filler entries.
    faq_entries = [
        {'text': answer, 'meta': {'name': real_question, 'question': real_question}},
    ]
    faq_entries.extend(
        {'text': answer, 'meta': {'name': filler_question, 'question': filler_question}}
        for _ in range(10)
    )

    embedder = EmbeddingRetriever(document_store=store, embedding_model="deepset/sentence_bert", gpu=False)

    # Embed one question per call and attach the vector to its entry.
    for entry in faq_entries:
        question_vector = embedder.embed([entry['meta']['question']])[0]
        entry['embedding'] = question_vector

    store.write_documents(faq_entries)

    faq_finder = Finder(reader=None, retriever=embedder)
    prediction = faq_finder.get_answers_via_similar_questions(question="How to test this?", top_k_retriever=1)

    answers = prediction.get('answers', [])
    assert len(answers) == 1
Exemplo n.º 4
0
def test_memory_store_get_by_tags():
    """Querying by the ``has_url`` tag must return an empty list.

    The three documents are written with ``name``, ``text`` and ``meta`` only
    (no ``tags`` entry), so the lookup for ``{'has_url': 'false'}`` is
    expected to match nothing.
    """
    from haystack.database.memory import InMemoryDocumentStore

    # (running number, meta url) for each document; the second has no url.
    numbered_urls = [(1, 'url'), (2, None), (3, 'url')]
    corpus = [
        {
            "name": f"testing the finder {number}",
            "text": f"testing the finder with pyhton unit test {number}",
            'meta': {'url': url},
        }
        for number, url in numbered_urls
    ]

    store = InMemoryDocumentStore()
    store.write_documents(corpus)

    matches = store.get_document_ids_by_tags({'has_url': 'false'})

    assert matches == []
Exemplo n.º 5
0
def test_tfidf_retriever():
    """Check id, text and meta of the top TF-IDF hit for "godzilla".

    Three documents are written to an ``InMemoryDocumentStore``; only the
    first one is given an explicit ``id``. After fitting a ``TfidfRetriever``
    the best match for ``"godzilla"`` must carry that id, the original text,
    and a ``meta`` dict holding the document's name.
    """
    from haystack.retriever.sparse import TfidfRetriever
    from haystack.database.memory import InMemoryDocumentStore

    expected_id = "26f84672c6d7aaeb8e2cd53e9c62d62d"
    corpus = [
        {"id": expected_id, "name": "testing the finder 1", "text": "godzilla says hello"},
        {"name": "testing the finder 2", "text": "optimus prime says bye"},
        {"name": "testing the finder 3", "text": "alien says arghh"},
    ]

    store = InMemoryDocumentStore()
    store.write_documents(corpus)

    tfidf = TfidfRetriever(store)
    tfidf.fit()

    top_hit = tfidf.retrieve("godzilla", top_k=1)[0]
    assert top_hit.id == expected_id
    assert top_hit.text == 'godzilla says hello'
    assert top_hit.meta == {"name": "testing the finder 1"}
Exemplo n.º 6
0
def test_memory_store_get_by_tag_lists_non_existent_tag():
    """A tag value that no document carries must yield an empty result.

    The only stored document is tagged ``tag1: ["1"]``; the query asks for
    ``tag1: ["3"]``.
    """
    from haystack.database.memory import InMemoryDocumentStore

    only_doc = {
        "name": "testing the finder 1",
        "text": "testing the finder with pyhton unit test 1",
        'meta': {'url': 'url'},
        'tags': [{'tag1': ["1"]}],
    }

    store = InMemoryDocumentStore()
    store.write_documents([only_doc])

    matches = store.get_document_ids_by_tags({'tag1': ["3"]})
    assert matches == []
Exemplo n.º 7
0
def test_memory_store_get_by_tag_lists_union():
    """Documents whose ``tag2`` list contains "1" must both be returned.

    Document 1 is tagged ``tag2: ["1"]``, document 2 ``tag1: ['1']`` and
    document 3 ``tag2: ["1", "2"]``. Querying for ``tag2: ["1"]`` is expected
    to return documents 1 and 3, in that order, as the dicts that were written.
    """
    from haystack.database.memory import InMemoryDocumentStore

    corpus = [
        {
            "name": "testing the finder 1",
            "text": "testing the finder with pyhton unit test 1",
            'meta': {'url': 'url'},
            'tags': [{'tag2': ["1"]}],
        },
        {
            "name": "testing the finder 2",
            "text": "testing the finder with pyhton unit test 2",
            'meta': {'url': None},
            'tags': [{'tag1': ['1']}],
        },
        {
            "name": "testing the finder 3",
            "text": "testing the finder with pyhton unit test 3",
            'meta': {'url': 'url'},
            'tags': [{'tag2': ["1", "2"]}],
        },
    ]

    store = InMemoryDocumentStore()
    store.write_documents(corpus)

    matches = store.get_document_ids_by_tags({'tag2': ["1"]})

    # Spelled out independently of ``corpus`` so the comparison does not
    # depend on the objects that were handed to the store.
    expected_first = {
        'name': 'testing the finder 1',
        'text': 'testing the finder with pyhton unit test 1',
        'meta': {'url': 'url'},
        'tags': [{'tag2': ['1']}],
    }
    expected_third = {
        'name': 'testing the finder 3',
        'text': 'testing the finder with pyhton unit test 3',
        'meta': {'url': 'url'},
        'tags': [{'tag2': ['1', '2']}],
    }
    assert matches == [expected_first, expected_third]
Exemplo n.º 8
0
def test_dpr_inmemory_retrieval():
    """Embed three passages with DPR, store them in memory and retrieve by query.

    Each passage is embedded on its own with ``embed_passages``; the vector
    is attached to the document under ``embedding`` and sanity-checked. After
    writing the documents, the query about Schopenhauer must return the
    second passage as the top hit.
    """
    store = InMemoryDocumentStore()

    passages = [
        """Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""",
        """Schopenhauer, describing him as an ultimately shallow thinker: ""Schopenhauer has quite a crude mind ... where real depth starts, his comes to an end."" His friend Bertrand Russell had a low opinion on the philosopher, and attacked him in his famous ""History of Western Philosophy"" for hypocritically praising asceticism yet not acting upon it. On the opposite isle of Russell on the foundations of mathematics, the Dutch mathematician L. E. J. Brouwer incorporated the ideas of Kant and Schopenhauer in intuitionism, where mathematics is considered a purely mental activity, instead of an analytic activity wherein objective properties of reality are""",
        """Democratic Republic of the Congo to the south. Angola's capital, Luanda, lies on the Atlantic coast in the northwest of the country. Angola, although located in a tropical zone, has a climate that is not characterized for this region, due to the confluence of three factors: As a result, Angola's climate is characterized by two seasons: rainfall from October to April and drought, known as ""Cacimbo"", from May to August, drier, as the name implies, and with lower temperatures. On the other hand, while the coastline has high rainfall rates, decreasing from North to South and from to , with""",
    ]
    # Documents are named by their position: '0', '1', '2'.
    corpus = [{'name': str(position), 'text': passage} for position, passage in enumerate(passages)]

    dpr = DensePassageRetriever(document_store=store,
                                embedding_model="dpr-bert-base-nq",
                                use_gpu=False)

    for entry in corpus:
        vector = dpr.embed_passages([entry['text']])[0]
        entry['embedding'] = vector

        assert vector.shape[0] == 768
        # NOTE(review): this is only an upper bound (no abs()), so any first
        # component below 0.52972 passes, and the same constant is applied to
        # all three passages. Looks unintended -- confirm the expected
        # per-passage values before tightening.
        assert vector[0] - 0.52872 < 0.001

    store.write_documents(corpus)

    hits = dpr.retrieve(query="Which philosopher attacked Schopenhauer?")
    assert hits[0].text == corpus[1]["text"]
Exemplo n.º 9
0
def document_store(request, test_docs_xs, elasticsearch_fixture):
    """Create a fresh document store for the backend named by ``request.param``.

    Args:
        request: Object whose ``param`` attribute selects the backend; one of
            ``"sql"``, ``"memory"`` or ``"elasticsearch"``.
        test_docs_xs: Not used in the body (presumably requested only so the
            fixture is set up first -- confirm against the caller).
        elasticsearch_fixture: Not used in the body (presumably ensures an
            Elasticsearch instance is available -- confirm against the caller).

    Returns:
        A newly created ``SQLDocumentStore``, ``InMemoryDocumentStore`` or
        ``ElasticsearchDocumentStore``.

    Raises:
        Exception: If ``request.param`` names an unsupported backend.
            Previously this case fell through to the ``return`` and failed
            with an ``UnboundLocalError``.
    """
    backend = request.param

    if backend == "sql":
        # Remove any leftover SQLite file so the store starts empty.
        if os.path.exists("qa_test.db"):
            os.remove("qa_test.db")
        return SQLDocumentStore(url="sqlite:///qa_test.db")

    if backend == "memory":
        return InMemoryDocumentStore()

    if backend == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test', ignore=[404])
        return ElasticsearchDocumentStore(index="haystack_test")

    raise Exception(f"No document store fixture for '{backend}'")
Exemplo n.º 10
0
def get_document_store(document_store_type):
    """Return a fresh document store of the requested type.

    Args:
        document_store_type: One of ``"sql"``, ``"memory"``,
            ``"elasticsearch"`` or ``"faiss"``.

    Returns:
        A newly created document store. For the SQLite-backed types an
        existing database file is removed first; for Elasticsearch all
        indices matching ``haystack_test*`` are deleted first.

    Raises:
        Exception: If ``document_store_type`` is not one of the supported
            values.
    """
    if document_store_type == "sql":
        sqlite_file = "haystack_test.db"
        if os.path.exists(sqlite_file):
            os.remove(sqlite_file)
        return SQLDocumentStore(url="sqlite:///haystack_test.db")

    if document_store_type == "memory":
        return InMemoryDocumentStore()

    if document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        es_client = Elasticsearch()
        es_client.indices.delete(index='haystack_test*', ignore=[404])
        return ElasticsearchDocumentStore(index="haystack_test")

    if document_store_type == "faiss":
        sqlite_file = "haystack_test_faiss.db"
        if os.path.exists(sqlite_file):
            os.remove(sqlite_file)
        return FAISSDocumentStore(sql_url="sqlite:///haystack_test_faiss.db")

    raise Exception(f"No document store fixture for '{document_store_type}'")
Exemplo n.º 11
0
from haystack import Finder
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
import json

# Document store setup.
# Elasticsearch is not used for now; the connection code is kept below, commented out.

# from haystack.database.elasticsearch import ElasticsearchDocumentStore

# don't use elasticsearch for now
# document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

# Use the in-memory document store instead.
from haystack.database.memory import InMemoryDocumentStore
document_store = InMemoryDocumentStore()

def build_dataset():
    """Write the text of every article in ``data.json`` to ``articles/<name>.txt``.

    ``data.json`` is read from the current working directory and is expected
    to hold an iterable of mappings with ``url`` and ``text`` keys. ``<name>``
    is the last path segment of the article's ``url``; a trailing ``/`` is
    ignored so such URLs do not all collapse onto ``articles/.txt``.

    Both files are handled as UTF-8 so the result does not depend on the
    platform's default encoding, and the ``articles`` directory is created
    when it does not exist yet.

    Raises:
        FileNotFoundError: If ``data.json`` does not exist.
        KeyError: If an article lacks a ``url`` or ``text`` key.
    """
    import os  # local import: ``os`` is not among this script's visible imports

    # Read everything up front so the input file is closed before writing.
    with open('data.json', encoding='utf-8') as source:
        articles = json.load(source)

    os.makedirs('articles', exist_ok=True)

    for article in articles:
        filename = article['url'].rstrip('/').split('/')[-1]
        with open('articles/%s.txt' % filename, 'w', encoding='utf-8') as target:
            # print() appends the trailing newline after the article text.
            print(article['text'], file=target)
    

# Convert files to dicts
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.

# Directory holding the article text files; build_dataset() writes to 'articles/<name>.txt'.
doc_dir = 'articles'
Exemplo n.º 12
0
import os

from haystack import Finder
from haystack.database.memory import InMemoryDocumentStore
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.retriever.tfidf import TfidfRetriever
from haystack.utils import print_answers

if __name__ == '__main__':
    # In-Memory Document Store
    document_store = InMemoryDocumentStore()

    # ## Cleaning & indexing documents
    #
    # Haystack provides a customizable cleaning and indexing pipeline for ingesting documents in Document Stores.

    # Set the path to the directory containing the text files.
    # os.path.join picks the separator of the running platform; the previous
    # hard-coded backslashes only produced a valid path on Windows.
    doc_dir = os.path.join(os.getcwd(), "kbQA", "data", "article_txt_got")

    # Convert the files to dicts containing documents that can be indexed to our datastore.
    # The cleaning function is optional; it is applied to each doc (e.g. to remove footers)
    # and must take a str as input and return a str.
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)