def add_eval_data(
    self,
    filename: str,
    doc_index: str = "eval_document",
    label_index: str = "label",
    batch_size: Optional[int] = None,
    preprocessor: Optional[PreProcessor] = None,
    max_docs: Optional[int] = None,
):
    """
    Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
    If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
    from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.

    :param filename: Name of the file containing evaluation data (json or jsonl)
    :param doc_index: Elasticsearch index where evaluation documents should be stored
    :param label_index: Elasticsearch index where labeled questions should be stored
    :param batch_size: Optional number of documents that are loaded and processed at a time.
                       When set to None (default) all documents are processed at once.
    :param preprocessor: Optional PreProcessor to preprocess evaluation documents.
                         It can be used for splitting documents into passages (and assigning labels to the
                         corresponding passages). Currently the PreProcessor does not support split_by="sentence",
                         cleaning, or split_overlap != 0. When set to None (default) preprocessing is disabled.
    :param max_docs: Optional number of documents that will be loaded.
                     When set to None (default) all available eval documents are used.
    """
    # TODO improve support for PreProcessor when adding eval data
    if preprocessor is not None:
        assert preprocessor.split_by != "sentence", (
            "Split by sentence not supported.\n"
            "Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
        )
        assert preprocessor.split_overlap == 0, (
            "Overlapping documents are currently not supported when adding eval data.\n"
            "Please set 'split_overlap=0' in the supplied PreProcessor."
        )
        assert not preprocessor.clean_empty_lines, (
            "clean_empty_lines is currently not supported when adding eval data.\n"
            "Please set 'clean_empty_lines=False' in the supplied PreProcessor."
        )
        assert not preprocessor.clean_whitespace, (
            "clean_whitespace is currently not supported when adding eval data.\n"
            "Please set 'clean_whitespace=False' in the supplied PreProcessor."
        )
        assert not preprocessor.clean_header_footer, (
            "clean_header_footer is currently not supported when adding eval data.\n"
            "Please set 'clean_header_footer=False' in the supplied PreProcessor."
        )

    file_path = Path(filename)
    if file_path.suffix == ".json":
        if batch_size is None:
            docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
            self.write_documents(docs, index=doc_index)
            self.write_labels(labels, index=label_index)
        else:
            jsonl_filename = (file_path.parent / (file_path.stem + ".jsonl")).as_posix()
            logger.info(
                f"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
                f"Converting json to jsonl to: {jsonl_filename}"
            )
            squad_json_to_jsonl(filename, jsonl_filename)
            # Forward preprocessor and max_docs so the batch-wise jsonl path matches the documented behaviour
            self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size, preprocessor, max_docs)
    elif file_path.suffix == ".jsonl":
        for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
            if docs:
                self.write_documents(docs, index=doc_index)
            if labels:
                self.write_labels(labels, index=label_index)
    else:
        logger.error("File needs to be in json or jsonl format.")
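# A minimal usage sketch (not part of the library source): indexing a SQuAD dev set
# batch-wise with a PreProcessor that satisfies the asserts above. The file path and
# store arguments are hypothetical; the import paths assume the Haystack 0.x layout.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.preprocessor.preprocessor import PreProcessor

document_store = ElasticsearchDocumentStore()  # assumes Elasticsearch on localhost:9200
preprocessor = PreProcessor(
    split_by="word",          # "sentence" is rejected by the asserts above
    split_length=200,
    split_overlap=0,          # overlap must be 0 when adding eval data
    clean_empty_lines=False,  # cleaning is not supported when adding eval data
    clean_whitespace=False,
    clean_header_footer=False,
)
document_store.add_eval_data(
    filename="data/squad_dev.json",  # hypothetical path; json is converted to jsonl first
    doc_index="eval_document",
    label_index="label",
    batch_size=1000,                 # load and index 1000 documents at a time
    preprocessor=preprocessor,
)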
def prepare_data(
    data_dir,
    filename_gold,
    filename_negative,
    remote_url,
    embeddings_filenames,
    embeddings_dir,
    n_docs=None,
    n_queries=None,
    add_precomputed=False,
):
    """
    filename_gold points to a SQuAD-format file.
    filename_negative points to a csv file where the first column is doc_id and the second is document text.
    If add_precomputed is True, this function will look in the embeddings files for precomputed embeddings
    to add to each Document.
    """
    logging.getLogger("farm").setLevel(logging.INFO)
    download_from_url(remote_url + filename_gold, filepath=data_dir + filename_gold)
    download_from_url(remote_url + filename_negative, filepath=data_dir + filename_negative)
    if add_precomputed:
        for embedding_filename in embeddings_filenames:
            download_from_url(
                remote_url + str(embeddings_dir) + embedding_filename,
                filepath=data_dir + str(embeddings_dir) + embedding_filename,
            )
    logging.getLogger("farm").setLevel(logging.WARN)

    gold_docs, labels = eval_data_from_json(data_dir + filename_gold)

    # Reduce number of docs
    gold_docs = gold_docs[:n_docs]

    # Remove labels whose gold docs have been removed
    doc_ids = [x.id for x in gold_docs]
    labels = [x for x in labels if x.document_id in doc_ids]

    # Filter labels down to n_queries
    selected_queries = list(set(f"{x.document_id} | {x.question}" for x in labels))
    selected_queries = selected_queries[:n_queries]
    labels = [x for x in labels if f"{x.document_id} | {x.question}" in selected_queries]

    # Fill up to n_docs with negative passages (guard against n_docs=None, where all
    # gold docs are kept and no negatives are needed)
    n_neg_docs = max(0, n_docs - len(gold_docs)) if n_docs is not None else 0
    neg_docs = prepare_negative_passages(data_dir, filename_negative, n_neg_docs)
    docs = gold_docs + neg_docs

    if add_precomputed:
        docs = add_precomputed_embeddings(data_dir + embeddings_dir, embeddings_filenames, docs)

    return docs, labels
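# A hypothetical invocation (all URLs, file names, and counts below are illustrative
# only, not taken from the source), mirroring how a retriever benchmark might call
# prepare_data to build a corpus of gold plus negative passages:
docs, labels = prepare_data(
    data_dir="data/",
    filename_gold="nq_dev.json",              # hypothetical SQuAD-format gold file
    filename_negative="negative_passages.csv",  # hypothetical doc_id,text csv
    remote_url="https://example.com/benchmark-data/",  # hypothetical mirror
    embeddings_filenames=["passages_embeddings_0.pkl"],
    embeddings_dir="embeddings/",
    n_docs=10_000,   # total corpus size: gold docs topped up with negatives
    n_queries=100,   # evaluate on at most 100 distinct queries
    add_precomputed=False,
)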
def benchmark_reader(ci=False, update_json=False, save_markdown=False, **kwargs):
    if ci:
        reader_models = reader_models_ci
    else:
        reader_models = reader_models_full

    reader_results = []
    doc_store = get_document_store("elasticsearch")

    # Download squad data
    _download_extract_downstream_data(input_file=data_dir / filename)
    docs, labels = eval_data_from_json(data_dir / filename, max_docs=None)
    index_to_doc_store(doc_store, docs, None, labels)

    for reader_name in reader_models:
        for reader_type in reader_types:
            logger.info(f"##### Start reader run - model: {reader_name}, type: {reader_type} #####")
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(
                    document_store=doc_store,
                    doc_index=doc_index,
                    label_index=label_index,
                    device="cuda",
                )
                results["passages_per_second"] = n_total_passages / results["reader_time"]
                results["reader"] = reader_name
                results["error"] = ""
                reader_results.append(results)
            except Exception as e:
                results = {
                    "EM": 0.0,
                    "f1": 0.0,
                    "top_n_accuracy": 0.0,
                    "top_n": 0,
                    "reader_time": 0.0,
                    "passages_per_second": 0.0,
                    "seconds_per_query": 0.0,
                    "reader": reader_name,
                    "error": str(e),  # store the message rather than the exception object
                }
                reader_results.append(results)

    reader_df = pd.DataFrame.from_records(reader_results)
    reader_df.to_csv(results_file)
    if save_markdown:
        md_file = results_file.replace(".csv", ".md")
        with open(md_file, "w") as f:
            f.write(reader_df.to_markdown())

    doc_store.delete_all_documents(label_index)
    doc_store.delete_all_documents(doc_index)
    if update_json:
        populate_reader_json()
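# A hypothetical driver (mirroring how such benchmark scripts are typically run; the
# module-level globals data_dir, filename, results_file, reader_models_*, etc. are
# assumed to be configured at the top of the file):
if __name__ == "__main__":
    # Run the small CI model set and emit both results.csv and a markdown summary
    benchmark_reader(ci=True, update_json=False, save_markdown=True)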
def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
                  batch_size: Optional[int] = None):
    """
    Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
    If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
    from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.

    :param filename: Name of the file containing evaluation data (json or jsonl)
    :type filename: str
    :param doc_index: Elasticsearch index where evaluation documents should be stored
    :type doc_index: str
    :param label_index: Elasticsearch index where labeled questions should be stored
    :type label_index: str
    :param batch_size: Number of documents that are loaded and processed at a time.
                       Only works with jsonl-formatted files. Setting batch_size and using a json-formatted
                       file will convert the json to jsonl prior to adding the eval data.
    :type batch_size: int
    """
    if filename.endswith(".json"):
        if batch_size is None:
            docs, labels = eval_data_from_json(filename)
            self.write_documents(docs, index=doc_index)
            self.write_labels(labels, index=label_index)
        else:
            jsonl_filename = filename + "l"  # foo.json -> foo.jsonl
            logger.info(
                f"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
                f"Converting json to jsonl to: {jsonl_filename}"
            )
            squad_json_to_jsonl(filename, jsonl_filename)
            self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size)
    elif filename.endswith(".jsonl"):
        for docs, labels in eval_data_from_jsonl(filename, batch_size):
            if docs:
                self.write_documents(docs, index=doc_index)
            if labels:
                self.write_labels(labels, index=label_index)
    else:
        logger.error("File needs to be in json or jsonl format.")
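# Follow-up sketch (hypothetical file name; Haystack 0.x import layout assumed): once
# eval data has been indexed into doc_index and label_index, a reader can be evaluated
# directly against those two indices, which is exactly what benchmark_reader above does.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.reader.farm import FARMReader

document_store = ElasticsearchDocumentStore()
document_store.add_eval_data("data/squad_dev.json", doc_index="eval_document", label_index="label")

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
metrics = reader.eval(
    document_store=document_store,
    doc_index="eval_document",
    label_index="label",
    device="cpu",
)
print(metrics["EM"], metrics["f1"], metrics["top_n_accuracy"])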