def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None):
    """
    Write annotation labels into the label index with a single ``bulk`` call.

    :param labels: Label objects or plain dicts; dicts are converted with ``Label.from_dict``.
    :param index: Name of the index to write to. Defaults to ``self.label_index``.
    """
    target_index = index or self.label_index
    if target_index and not self.client.indices.exists(index=target_index):
        self._create_label_index(target_index)

    # Normalize first so that every entry complies with the Label class format
    normalized = []
    for item in labels:
        normalized.append(Label.from_dict(item) if isinstance(item, dict) else item)

    # One bulk action per label; the op type follows self.update_existing_documents
    actions = [
        {
            "_op_type": "index" if self.update_existing_documents else "create",
            "_index": target_index,
            **entry.to_dict(),
        }
        for entry in normalized
    ]  # type: List[Dict[str, Any]]
    bulk(self.client, actions, request_timeout=300, refresh=self.refresh_type)
def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
    """
    Store annotation labels in ``self.indexes[index]``, keyed by ``Label.id``.

    A warning listing the affected ids is logged when ``self._get_duplicate_labels`` reports
    labels whose id already exists. Those labels are written anyway and replace the stored entry.

    :param labels: Label objects or plain dicts; dicts are converted with ``Label.from_dict``.
    :param index: Name of the label index to write to. Defaults to ``self.label_index``.
    """
    store_name = index or self.label_index
    parsed = [item if not isinstance(item, dict) else Label.from_dict(item) for item in labels]

    clashing_ids: list = []
    for duplicate in self._get_duplicate_labels(parsed, index=store_name):
        clashing_ids.append(duplicate.id)

    if clashing_ids:
        logger.warning(
            f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
            f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
            f" the answer annotation and not the question."
            f" Problematic ids: {','.join(clashing_ids)}")

    for entry in parsed:
        # Missing creation time is set to "now"; missing update time mirrors the creation time
        if not entry.created_at:
            entry.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
        if not entry.updated_at:
            entry.updated_at = entry.created_at
        self.indexes[store_name][entry.id] = entry
def get_all_labels(
    self,
    index: Optional[str] = None,
    filters: Optional[Dict[str, List[str]]] = None
) -> List[Label]:
    """
    Return every label found in the given index.

    :param index: Name of the index to read from. Defaults to ``self.label_index``.
    :param filters: Passed through unchanged to ``self.get_all_documents_in_index``.
    :return: One Label per hit, built from the hit's ``"_source"`` entry.
    """
    hits = self.get_all_documents_in_index(index=index or self.label_index, filters=filters)
    return [Label.from_dict(hit["_source"]) for hit in hits]
def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
    """
    Store annotation labels in ``self.indexes[index]`` under freshly generated uuid4 keys.

    :param labels: Label objects or plain dicts; dicts are converted with ``Label.from_dict``.
    :param index: Name of the label index to write to. Defaults to ``self.label_index``.
    """
    target = index or self.label_index
    converted = [item if not isinstance(item, dict) else Label.from_dict(item) for item in labels]
    for entry in converted:
        # The key is a new random uuid per label, not the label's own id
        self.indexes[target][str(uuid4())] = entry
def get_all_labels(
    self,
    index: Optional[str] = None,
    filters: Optional[Dict[str, List[str]]] = None,
    batch_size: int = 10_000
) -> List[Label]:
    """
    Return all labels in the document store.

    :param index: Name of the index to read from. Defaults to ``self.label_index``.
    :param filters: Passed through unchanged to ``self._get_all_documents_in_index``.
    :param batch_size: Passed through unchanged to ``self._get_all_documents_in_index``.
    :return: One Label per hit, built from the hit's ``"_source"`` entry.
    """
    index = index or self.label_index
    hits = self._get_all_documents_in_index(index=index, filters=filters, batch_size=batch_size)
    # Iterate the hits directly: copying them into a throwaway list first only
    # holds every raw hit in memory next to the Label objects built from them.
    return [Label.from_dict(hit["_source"]) for hit in hits]
def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None, batch_size: int = 10_000):
    """
    Write annotation labels into document store, sending them to ``bulk`` in batches.

    :param labels: A list of Python dictionaries or a list of Haystack Label objects.
    :param index: Name of the index to write to. Defaults to ``self.label_index``.
    :param batch_size: Number of labels that are passed to the bulk function at a time.
    """
    index = index or self.label_index
    if index and not self.client.indices.exists(index=index):
        self._create_label_index(index)

    pending = []
    for item in labels:
        # Dicts are converted so that everything complies with the Label class format
        label = Label.from_dict(item) if isinstance(item, dict) else item

        # Missing creation time is set to "now"; missing update time mirrors the creation time
        if not label.created_at:
            label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
        if not label.updated_at:
            label.updated_at = label.created_at

        action = {
            "_op_type": "index" if self.update_existing_documents else "create",
            "_index": index,
            **label.to_dict(),
        }  # type: Dict[str, Any]

        # The label's own id is moved to the "_id" key of the action
        if label.id is not None:
            action["_id"] = str(action.pop("id"))
        pending.append(action)

        # Flush as soon as a full batch has been collected
        if len(pending) % batch_size == 0:
            bulk(self.client, pending, request_timeout=300, refresh=self.refresh_type)
            pending = []

    # Send whatever is left over from the last, incomplete batch
    if pending:
        bulk(self.client, pending, request_timeout=300, refresh=self.refresh_type)
def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
    """
    Store annotation labels in ``self.indexes[index]`` under freshly generated uuid4 keys.

    Labels without ``created_at`` / ``updated_at`` get those fields filled in before being stored.

    :param labels: Label objects or plain dicts; dicts are converted with ``Label.from_dict``.
    :param index: Name of the label index to write to. Defaults to ``self.label_index``.
    """
    target = index or self.label_index
    converted = [item if not isinstance(item, dict) else Label.from_dict(item) for item in labels]
    for entry in converted:
        # Missing creation time is set to "now"; missing update time mirrors the creation time
        if not entry.created_at:
            entry.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
        if not entry.updated_at:
            entry.updated_at = entry.created_at
        # The key is a new random uuid per label, not the label's own id
        self.indexes[target][str(uuid4())] = entry
def write_labels(self, labels, index=None):
    """
    Add one ``LabelORM`` object per label to ``self.session`` and commit once at the end.

    :param labels: Label objects or plain dicts; dicts are converted with ``Label.from_dict``.
    :param index: Index name stored on each ``LabelORM`` object. Defaults to ``self.label_index``.
    """
    target_index = index or self.label_index
    converted = [item if not isinstance(item, dict) else Label.from_dict(item) for item in labels]

    # Attributes copied one-to-one from each label onto its LabelORM object
    copied_fields = (
        "document_id",
        "no_answer",
        "origin",
        "question",
        "is_correct_answer",
        "is_correct_document",
        "answer",
        "offset_start_in_doc",
        "model_id",
    )
    for entry in converted:
        field_values = {name: getattr(entry, name) for name in copied_fields}
        self.session.add(LabelORM(**field_values, index=target_index))

    self.session.commit()
def write_labels(self, labels, index=None):
    """
    Write annotation labels into document store.

    Every label is turned into one ``LabelORM`` object. Labels whose id is reported by
    ``self._get_duplicate_labels`` are merged into ``self.session`` (and a warning listing
    their ids is logged); all other labels are added. A single commit is issued at the end.

    :param labels: Label objects or plain dicts; dicts are converted with ``Label.from_dict``.
    :param index: Index name stored on each ``LabelORM`` object. Defaults to ``self.label_index``.
    """
    labels = [Label.from_dict(l) if isinstance(l, dict) else l for l in labels]
    index = index or self.label_index

    duplicate_ids: list = [label.id for label in self._get_duplicate_labels(labels, index=index)]
    if duplicate_ids:
        logger.warning(
            f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
            f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
            f" the answer annotation and not the question."
            f" Problematic ids: {','.join(duplicate_ids)}")

    # Set for constant-time lookups in the loop below; the list above is kept
    # only because it preserves the order of the ids for the log message.
    duplicate_id_set = set(duplicate_ids)

    # TODO: Use batch_size
    for label in labels:
        label_orm = LabelORM(
            id=label.id,
            document_id=label.document_id,
            no_answer=label.no_answer,
            origin=label.origin,
            question=label.question,
            is_correct_answer=label.is_correct_answer,
            is_correct_document=label.is_correct_document,
            answer=label.answer,
            offset_start_in_doc=label.offset_start_in_doc,
            model_id=label.model_id,
            index=index,
        )
        # Ids that already exist are merged rather than added a second time
        if label.id in duplicate_id_set:
            self.session.merge(label_orm)
        else:
            self.session.add(label_orm)
    self.session.commit()