Example #1
    def launch_elasticsearch(self, launch: bool = False, name: str = "hera"):
        if launch:
            logging.info("Starting Elasticsearch ...")
            status = subprocess.run(
                f'docker run -d -p 9200:9200 --name "{name}" '
                '-e "discovery.type=single-node" elasticsearch:7.6.2',
                shell=True)
            time.sleep(30)
        else:
            logging.info("Restarting Elasticsearch ...")
            try:
                # check=True turns a failed "docker stop" into a CalledProcessError
                status = subprocess.run(f'docker stop {name}', shell=True, check=True)
            except subprocess.CalledProcessError:
                raise RuntimeError("No running containers")
            finally:
                status = subprocess.run(f'docker start {name}', shell=True)
                time.sleep(30)

        index = "document"
        document_store = ElasticsearchDocumentStore(host="localhost",
                                                    username="",
                                                    password="",
                                                    index=index)

        dicts = convert_files_to_dicts(dir_path=self.data_path,
                                       clean_func=self.clean_website_text,
                                       split_paragraphs=True)
        try:
            document_store.delete_all_documents(index=index)
        except Exception:
            pass
        finally:
            document_store.write_documents(dicts)
        return status
Example #2
def test_get_document_count_only_documents_without_embedding_arg():
    documents = [
        {
            "text": "text1",
            "id": "1",
            "embedding": np.random.rand(768).astype(np.float32),
            "meta_field_for_count": "a"
        },
        {
            "text": "text2",
            "id": "2",
            "embedding": np.random.rand(768).astype(np.float64),
            "meta_field_for_count": "b"
        },
        {
            "text": "text3",
            "id": "3",
            "embedding": np.random.rand(768).astype(np.float32).tolist()
        },
        {
            "text": "text4",
            "id": "4",
            "meta_field_for_count": "b"
        },
        {
            "text": "text5",
            "id": "5",
            "meta_field_for_count": "b"
        },
        {
            "text": "text6",
            "id": "6",
            "meta_field_for_count": "c"
        },
        {
            "text": "text7",
            "id": "7",
            "embedding": np.random.rand(768).astype(np.float64),
            "meta_field_for_count": "c"
        },
    ]

    _index: str = "haystack_test_count"
    document_store = ElasticsearchDocumentStore(index=_index)
    document_store.delete_documents(index=_index)

    document_store.write_documents(documents)

    assert document_store.get_document_count() == 7
    assert document_store.get_document_count(
        only_documents_without_embedding=True) == 3
    assert document_store.get_document_count(
        only_documents_without_embedding=True,
        filters={"meta_field_for_count": ["c"]}) == 1
    assert document_store.get_document_count(
        only_documents_without_embedding=True,
        filters={"meta_field_for_count": ["b"]}) == 2
Example #3
def test_elasticsearch_custom_fields(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(index="haystack_test_custom", text_field="custom_text_field",
                                                embedding_field="custom_embedding_field")

    doc_to_write = {"custom_text_field": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
    document_store.write_documents([doc_to_write])
    documents = document_store.get_all_documents(return_embedding=True)
    assert len(documents) == 1
    assert documents[0].text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)


def test_elasticsearch_custom_query(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(index="haystack_test_custom", text_field="custom_text_field",
                                                embedding_field="custom_embedding_field")
    documents = [
        {"text": "test_1", "meta": {"year": "2019"}},
        {"text": "test_2", "meta": {"year": "2020"}},
        {"text": "test_3", "meta": {"year": "2021"}},
        {"text": "test_4", "meta": {"year": "2021"}},
        {"text": "test_5", "meta": {"year": "2021"}},
    ]
    document_store.write_documents(documents)

    # test custom "terms" query
    retriever = ElasticsearchRetriever(
        document_store=document_store,
        custom_query="""
            {
                "size": 10, 
                "query": {
                    "bool": {
                        "should": [{
                            "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
                            "filter": [{"terms": {"year": ${years}}}]}}}"""
    )
    results = retriever.run(query="test", filters={"years": ["2020", "2021"]})[0]["documents"]
    assert len(results) == 4

    # test custom "term" query
    retriever = ElasticsearchRetriever(
        document_store=document_store,
        custom_query="""
                {
                    "size": 10, 
                    "query": {
                        "bool": {
                            "should": [{
                                "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
                                "filter": [{"term": {"year": ${years}}}]}}}"""
    )
    results = retriever.run(query="test", filters={"years": "2021"})[0]["documents"]
    assert len(results) == 3
Example #5
def update_document():
    """Return a the url of the index document."""
    if request.files:
        # index is the target document where queries need to sent.
        index = request.form['index']
        # uploaded document for target source
        doc = request.files["doc"]

        file_path = os.path.join(app.config["input"], doc.filename)

        # saving the file to the input directory
        doc.save(file_path)
        # Initialize the Haystack Elasticsearch document store
        document_store = ElasticsearchDocumentStore(
            host=app.config["host"],
            port=app.config["port"],
            username=app.config["username"],
            password=app.config["password"],
            index=index)
        # Convert the PDF files into dicts and write them to the Elasticsearch document store
        dicts = convert_files_to_dicts(app.config["input"],
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=False)
        document_store.write_documents(dicts)
        os.remove(file_path)
        return json.dumps({
            'status': 'Success',
            'message': 'document available at http://' + app.config["host"] +
                       ':' + app.config["port"] + '/' + index + '/_search',
            'result': []
        })
    else:
        return json.dumps({
            'status': 'Failed',
            'message': 'No file uploaded',
            'result': []
        })
Example #6
File: squad2dpr.py Project: psorianom/DPR
def launch_and_index_es(documents_dicts: List):
    es = Elasticsearch(['http://localhost:9200/'], verify_certs=True)
    if not es.ping():
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance, "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        sleep(7)

    es.indices.delete(index='document', ignore=[400, 404])
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    document_store.write_documents(documents_dicts)
    retriever = ElasticsearchRetriever(document_store=document_store)
    return retriever
Example #7
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from tqdm import tqdm

document_store = ElasticsearchDocumentStore(refresh_type='false')


def concat_sents(sents):
    """Greedily concatenate sentences into chunks of roughly 200+ words."""
    word_count = 0
    chunk = ''
    for s in sents:
        word_count += len(str(s).split())
        chunk += s
        if word_count > 200:
            yield chunk
            word_count = 0
            chunk = ''
    # Also yield the trailing sentences that never reached the 200-word threshold.
    if chunk:
        yield chunk


import pickle

with open('../data/arxiv-processed-pickle', 'rb') as f:
    dic = pickle.load(f)
for (sents, name) in tqdm(dic):
    to_insert = []
    for s in concat_sents(sents):
        to_insert.append({'text': s, 'meta': {'name': name}})
    document_store.write_documents(to_insert)
Example #8
from transformers import AutoTokenizer, AutoModel

from definitions import from_root_dir
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from pulp.cy.dense.recv import TransformersEmbeddingRetriever


def to_meta_dict(meta: dict) -> dict:
    # Use the abstract (if present) as the document text; keep the remaining fields as metadata.
    abstract = meta.pop('abstract', None)
    return {'text': abstract, 'meta': meta}


store = ElasticsearchDocumentStore(refresh_type='false', index='meta')
retriever = TransformersEmbeddingRetriever(
    document_store=store,
    embedding_model=AutoModel.from_pretrained(
        from_root_dir('models/scibert_scivocab_uncased')),
    tokenizer=AutoTokenizer.from_pretrained(
        from_root_dir('models/scibert_scivocab_uncased')))
import pickle

with open(from_root_dir('data/arxiv-metadata_pickle'), 'rb') as f:
    metadata_records = pickle.load(f)

store.write_documents((to_meta_dict(m) for m in metadata_records))
store.update_embeddings(retriever, index='meta')
# retriever.embed(['It is shown that, within a Ginzburg-Landau (GL) formalism, the\nsuperconducting fluctuation is insulating at zero temperature even if the\nfluctuation dynamics is metallic (dissipative). Based on this fact, the low\ntemperature behavior of the $H_{c2}$-line and the resistivity curves near a\nzero temperature transition are discussed. In particular, it is pointed out\nthat the neglect of quantum fluctuations in data analysis of the dc resistivity\nmay lead to an under-estimation of the $H_{c2}$ values near zero temperature.\n'])
# Let's first fetch some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.

# Now, let's write the docs to our DB.
if LAUNCH_ELASTICSEARCH:
    document_store.write_documents(dicts)
else:
    logger.warning(
        "Since we already have a running ES instance we should not index the same documents again. \n"
        "If you still want to do this call: document_store.write_documents(dicts) manually "
    )

# ## Initialize Retriever, Reader & Finder
#
# ### Retriever
#
# Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
# could be answered.
#
# They use simple but fast algorithms.
# **Here:** We use Elasticsearch's default BM25 algorithm
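
# A minimal sketch of the BM25 retriever described above (same call as in Example #10 below;
# assumes ElasticsearchRetriever is imported as it is there):
# retriever = ElasticsearchRetriever(document_store=document_store)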
Example #10
def tutorial1_basic_qa_pipeline():
    logger = logging.getLogger(__name__)

    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of
    # `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.
    #
    # **Here:** We recommend Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval,
    # and vector storage for text embeddings.
    # **Alternatives:** If you are unable to set up an Elasticsearch instance, follow Tutorial 3
    # for using SQL/InMemory document stores (a minimal in-memory alternative is also sketched in a comment below).
    # **Hint**:
    # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can
    # configure Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in
    # your environment (e.g., in Colab notebooks), then you can manually download and run Elasticsearch.

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance, "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and add
    # them in Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.
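
    # Sketch of such a cleaning function (hypothetical, not used in this tutorial):
    # def remove_footer(text: str) -> str:
    #     return text.replace("All rights reserved.", "")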

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance we should not index the same documents again. \n"
            "If you still want to do this call: document_store.write_documents(dicts) manually "
        )

    # ## Initialize Retriever, Reader & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use simple but fast algorithms.
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    # - Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
    # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of
    #   embeddings (e.g. created via Sentence-BERT)
    # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging

    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes
    # with SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium-sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost parameter. Higher values
    #           mean the model prefers "no answer possible"; see the commented sketch after the FARMReader below.
    #
    # #### FARMReader

    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)
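
    # Sketch for the no_ans_boost hint above (parameter name as used by FARMReader; the value is only illustrative):
    # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
    #                     use_gpu=True, no_ans_boost=0.5)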

    # #### TransformersReader

    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)
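
    # Sketch of the "customize for your own use cases" point above: the same retriever->reader DAG
    # built explicitly with the generic Pipeline class instead of the predefined ExtractiveQAPipeline:
    # from haystack.pipeline import Pipeline
    # custom_pipe = Pipeline()
    # custom_pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
    # custom_pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])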

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Example #11
        temp["meta"]["paper_id"] = data.find('aid').getText()
        temp["meta"]["doi"] = data.find('doi').getText()
        temp["meta"]["jid"] = data.find('jid').getText()

        paper_text = [
            t for t in data.find_all(text=True) if t.parent.name in target_tags
        ]
        temp["text"] = ''.join(paper_text)

        docs.append(temp)
    return docs


nf_docs = nf2020toDict()
logging.info("Indexing Elesier articals with full text")
document_store.write_documents(nf_docs)

# Elsevier dataset with abstracts only
ctf_hackathon_doc = pd.read_json('./data/ctf-hackathon-upload.json',
                                 lines=True)

with open('./data/HACKXML0000000004/dataset.xml', 'r') as f:
    papers_info = f.read()
papers_info_data = BeautifulSoup(papers_info, "xml")
paper_xml_doi_ls = np.unique(
    [t.getText() for t in papers_info_data.find_all('doi')])
other_nfpaper_docs = ctf_hackathon_doc[~ctf_hackathon_doc.doi.
                                       isin(paper_xml_doi_ls)].copy()

other_nfpaper_docs_dicts = []
for i, row in other_nfpaper_docs.iterrows():
def tutorial4_faq_style_qa():
    ## "FAQ-Style QA": Utilizing existing FAQs for Question Answering

    # While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data.
    #
    # Pros:
    # - Very fast at inference time
    # - Utilize existing FAQ data
    # - Quite good control over answers
    #
    # Cons:
    # - Generalizability: We can only answer questions that are similar to existing ones in FAQ
    #
    # In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option.
    LAUNCH_ELASTICSEARCH = False

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance, "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(30)

    ### Init the DocumentStore
    # In contrast to Tutorial 1 (extractive QA), we:
    #
    # * specify the name of our `text_field` in Elasticsearch that we want to return as an answer
    # * specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question
    # * set `excluded_meta_data=["question_emb"]` so that we don't return the huge embedding vectors in our search results

    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        embedding_field="question_emb",
        embedding_dim=768,
        excluded_meta_data=["question_emb"],
        similarity="cosine")

    ### Create a Retriever using embeddings
    # Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones).
    # We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings.
    #
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=True)

    # Download a csv containing some FAQ data
    # Here: Some question-answer pairs related to COVID-19
    temp = requests.get(
        "https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv"
    )
    with open('small_faq_covid.csv', 'wb') as f:
        f.write(temp.content)

    # Get dataframe with columns "question", "answer" and some custom metadata
    df = pd.read_csv("small_faq_covid.csv")
    # Minimal cleaning
    df.fillna(value="", inplace=True)
    df["question"] = df["question"].apply(lambda x: x.strip())
    print(df.head())

    # Get embeddings for our questions from the FAQs
    questions = list(df["question"].values)
    df["question_emb"] = retriever.embed_queries(texts=questions)
    df = df.rename(columns={"question": "text"})

    # Convert Dataframe to list of dicts and index them in our DocumentStore
    docs_to_index = df.to_dict(orient="records")
    document_store.write_documents(docs_to_index)

    #    Initialize a Pipeline (this time without a reader) and ask questions

    from haystack.pipeline import FAQPipeline
    pipe = FAQPipeline(retriever=retriever)

    prediction = pipe.run(query="How is the virus spreading?",
                          top_k_retriever=10)
    print_answers(prediction, details="all")
Example #13
# Download a csv containing some FAQ data
# Here: Some question-answer pairs related to COVID-19
temp = requests.get(
    "https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv"
)
with open('small_faq_covid.csv', 'wb') as f:
    f.write(temp.content)

# Get dataframe with columns "question", "answer" and some custom metadata
df = pd.read_csv("small_faq_covid.csv")
# Minimal cleaning
df.fillna(value="", inplace=True)
df["question"] = df["question"].apply(lambda x: x.strip())
print(df.head())

# Get embeddings for our questions from the FAQs
questions = list(df["question"].values)
df["question_emb"] = retriever.embed_queries(texts=questions)
df["question_emb"] = df["question_emb"]
df = df.rename(columns={"answer": "text"})

# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)

# Init reader & use Finder to get answer (same as in Tutorial 1)
finder = Finder(reader=None, retriever=retriever)
prediction = finder.get_answers_via_similar_questions(
    question="How is the virus spreading?", top_k_retriever=10)
print_answers(prediction, details="all")
Example #14
    split_respect_sentence_boundary=False,
    split_overlap=0)

as4Docs = processor.process(as4)
# print(as4Docs)

for i in range(0, len(as4Docs)):
    print(i)
    print(":\n")
    print(as4Docs[i])
    print("---------------")

# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document")

document_store.delete_all_documents()
document_store.write_documents(as4Docs)

backagain = document_store.get_all_documents()

for i in range(0, len(backagain)):
    print(i)
    print(":\n")
    print(backagain[i])
    print("---------------")

# Re-create the processor, this time with passage-level splitting
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="passage",
    split_length=1,
    split_respect_sentence_boundary=False,
    split_overlap=0
)

as4Docs = processor.process(as4)

for i in range(len(as4Docs)):
    as4Docs[i]["meta"]["table"] = False

document_store_ES.delete_all_documents()
document_store_ES.write_documents(as4Docs)
document_store_FAISS.delete_all_documents()
document_store_FAISS.write_documents(as4Docs)

#backagain = document_store.get_all_documents();

for i in range(0, len(as4Docs)):
    print(str(i) + ":", end=" ")
    print(as4Docs[i])

# Update table content and table description - csv for table content, txt for table description.
# The files are under the /tables directory; the names do not matter, as long as each csv and its txt match.
# Rerun this section whenever a new table is uploaded.

import csv
import os
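
# A minimal sketch of the pairing convention described above (the "tables" directory name and this
# helper are assumptions, not part of the original script): each <name>.csv holds table content and
# is matched to <name>.txt, which holds the table description.
def paired_table_files(table_dir="tables"):
    for fname in sorted(os.listdir(table_dir)):
        base, ext = os.path.splitext(fname)
        txt_path = os.path.join(table_dir, base + ".txt")
        if ext == ".csv" and os.path.exists(txt_path):
            yield os.path.join(table_dir, fname), txt_path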