def _extract_docs_and_labels_from_dict(document_dict: Dict):
    """Convert one document entry (SQuAD-like layout) into Documents and gold Labels.

    One ``Document`` is created per entry in ``document_dict["paragraphs"]``.
    For every question in a paragraph's ``"qas"`` list, one ``Label`` is
    created per gold answer; a question without any answers yields a single
    label with an empty answer string and offset ``0``.

    :param document_dict: Dict with a ``"paragraphs"`` list and an optional
        ``"title"``. Any other top-level keys are copied into the meta data of
        every created document.
    :return: Tuple ``(docs, labels)`` holding the created ``Document`` and
        ``Label`` objects.
    """
    docs = []
    labels = []

    # Extra document-level fields (everything except the paragraphs and title).
    meta_doc = {
        k: v for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    title = document_dict.get("title")

    for paragraph in document_dict["paragraphs"]:
        cur_meta = {"name": title}
        # Paragraph-level fields first ...
        cur_meta.update(
            {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
        )
        # ... then document-level fields, which win on key collisions.
        cur_meta.update(meta_doc)

        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        docs.append(cur_doc)

        for qa in paragraph["qas"]:
            answers = qa["answers"]
            # "is_impossible" is not guaranteed to be present (data without
            # unanswerable questions may omit it). When it is missing, treat
            # the question as unanswerable exactly when it has no gold answer
            # instead of failing with a KeyError.
            no_answer = qa.get("is_impossible", not answers)

            # A question without answers still gets one (empty) gold label.
            spans = [(a["text"], a["answer_start"]) for a in answers] or [("", 0)]
            for answer_text, answer_start in spans:
                labels.append(
                    Label(
                        question=qa["question"],
                        answer=answer_text,
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_doc.id,
                        offset_start_in_doc=answer_start,
                        no_answer=no_answer,
                        origin="gold_label",
                    )
                )

    return docs, labels
def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
    """Write annotation labels into document store.

    :param labels: Labels to store. Dict entries are converted with
        ``Label.from_dict``; all other entries are stored as they are.
    :param index: Name of the index to write to. Falls back to
        ``self.label_index`` when not given.
    """
    target_index = index or self.label_index

    # Convert everything up front so a failing conversion stores nothing.
    normalized = []
    for item in labels:
        if isinstance(item, dict):
            item = Label.from_dict(item)
        normalized.append(item)

    for label_obj in normalized:
        # Every stored label gets a freshly generated key.
        key = str(uuid4())
        self.indexes[target_index][key] = label_obj
def get_all_labels(
    self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None
) -> List[Label]:
    """
    Return all labels in the document store

    :param index: Index to read from. Falls back to ``self.label_index`` when not given.
    :param filters: Passed through unchanged to ``get_all_documents_in_index``.
    :return: One ``Label`` per returned hit, built from the hit's ``"_source"`` entry.
    """
    hits = self.get_all_documents_in_index(
        index=index or self.label_index, filters=filters
    )
    return [Label.from_dict(hit["_source"]) for hit in hits]
def _convert_sql_row_to_label(self, row) -> Label:
    """Build a ``Label`` by copying the label attributes of ``row`` one to one.

    :param row: Object exposing the attributes listed in ``field_names`` below.
    :return: ``Label`` constructed with those attribute values as keyword arguments.
    """
    field_names = (
        "document_id",
        "no_answer",
        "origin",
        "question",
        "is_correct_answer",
        "is_correct_document",
        "answer",
        "offset_start_in_doc",
        "model_id",
    )
    return Label(**{name: getattr(row, name) for name in field_names})
def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None, batch_size: Optional[int] = None):
    """Write annotation labels into document store.

    :param labels: A list of Python dictionaries or a list of Haystack Label objects.
    :param index: Index to write to. Falls back to ``self.label_index`` when not given;
                  the index is created via ``_create_label_index`` if it does not exist yet.
    :param batch_size: Number of labels that are passed to Elasticsearch's bulk function at a time.
                       If `None`, all labels will be passed to bulk at once.
    """
    index = index or self.label_index
    if index and not self.client.indices.exists(index=index):
        self._create_label_index(index)

    def _flush(actions):
        # Single place for the bulk request settings.
        bulk(self.client, actions, request_timeout=300, refresh=self.refresh_type)

    pending = []
    for item in labels:
        # Make sure we comply to Label class format
        label = Label.from_dict(item) if isinstance(item, dict) else item

        action = {
            "_op_type": "index" if self.update_existing_documents else "create",
            "_index": index,
        }  # type: Dict[str, Any]
        # Label fields are merged last, so they take precedence on key clashes.
        action.update(label.to_dict())

        # rename id for elastic
        if label.id is not None:
            action["_id"] = str(action.pop("id"))
        pending.append(action)

        # `pending` is emptied after every flush, so the modulo check fires
        # each time it has grown to a full batch.
        if batch_size is not None and len(pending) % batch_size == 0:
            _flush(pending)
            pending = []

    # Send whatever is left (or everything, when no batch_size was given).
    if pending:
        _flush(pending)
def write_labels(self, labels, index=None):
    """Write annotation labels into document store.

    :param labels: Iterable of ``Label`` objects and/or dicts; dicts are
        converted with ``Label.from_dict`` first.
    :param index: Value stored in the ``index`` column of every row. Falls
        back to ``self.label_index`` when not given.
    """
    label_objects = [
        Label.from_dict(item) if isinstance(item, dict) else item for item in labels
    ]
    index = index or self.label_index

    # Label attributes that are copied one to one into the ORM row.
    column_names = (
        "document_id",
        "no_answer",
        "origin",
        "question",
        "is_correct_answer",
        "is_correct_document",
        "answer",
        "offset_start_in_doc",
        "model_id",
    )

    # TODO: Use batch_size
    for label in label_objects:
        columns = {name: getattr(label, name) for name in column_names}
        self.session.add(LabelORM(**columns, index=index))

    # One commit for all labels added above.
    self.session.commit()