예제 #1
0
# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
doc_dir = "../data/nq"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document",
                                            create_index=False, embedding_field="emb",
                                            embedding_dim=768, excluded_meta_data=["emb"])


# Add evaluation data to Elasticsearch database
# We first delete the custom tutorial indices to not have duplicate elements
document_store.delete_all_documents(index=doc_index)
document_store.delete_all_documents(index=label_index)
document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json", doc_index=doc_index, label_index=label_index)


# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative: Evaluate DensePassageRetriever
# Note, that DPR works best when you index short passages < 512 tokens as only those tokens will be used for the embedding.
# Here, for nq_dev_subset_v2.json we have avg. num of tokens = 5220(!).
# DPR still outperforms Elastic's BM25 by a small margin here.

# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store, embedding_model="dpr-bert-base-nq",batch_size=32)
# document_store.update_embeddings(retriever, index="eval_document")

예제 #2
0
    )
    if status.returncode:
        raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                        "then set LAUNCH_ELASTICSEARCH in the script to False.")
    time.sleep(30)

# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
doc_dir = "../data/nq"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", create_index=False)
# Add evaluation data to Elasticsearch database
if LAUNCH_ELASTICSEARCH:
    document_store.add_eval_data("../data/nq/nq_dev_subset.json")
else:
    logger.warning("Since we already have a running ES instance we should not index the same documents again."
                   "If you still want to do this call: 'document_store.add_eval_data('../data/nq/nq_dev_subset.json')' manually ")

# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Initialize Reader
reader = FARMReader("deepset/roberta-base-squad2")

# Initialize Finder which sticks together Reader and Retriever
finder = Finder(reader, retriever)


## Evaluate Retriever on its own
예제 #3
0
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document",
                                            create_index=False,
                                            embedding_field="emb",
                                            embedding_dim=768,
                                            excluded_meta_data=["emb"])

# Add evaluation data to Elasticsearch database
if LAUNCH_ELASTICSEARCH:
    document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json",
                                 doc_index="eval_document",
                                 label_index="feedback")
else:
    logger.warning(
        "Since we already have a running ES instance we should not index the same documents again."
        "If you still want to do this call: 'document_store.add_eval_data('../data/nq/nq_dev_subset_v2.json')' manually "
    )

# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative: Evaluate DensePassageRetriever
# Note, that DPR works best when you index short passages < 512 tokens as only those tokens will be used for the embedding.
# Here, for nq_dev_subset_v2.json we have avg. num of tokens = 5220(!).
# DPR still outperforms Elastic's BM25 by a small margin here.