示例#1
0
def _extract_docs_and_labels_from_dict(document_dict: Dict):
    docs = []
    labels = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {
        k: v
        for k, v in document_dict.items() if k not in ("paragraphs", "title")
    }
    for paragraph in document_dict["paragraphs"]:
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {
            k: v
            for k, v in paragraph.items() if k not in ("qas", "context")
        }
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)
        # Create Document
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        docs.append(cur_doc)

        # Get Labels
        for qa in paragraph["qas"]:
            if len(qa["answers"]) > 0:
                for answer in qa["answers"]:
                    label = Label(
                        question=qa["question"],
                        answer=answer["text"],
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_doc.id,
                        offset_start_in_doc=answer["answer_start"],
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                label = Label(
                    question=qa["question"],
                    answer="",
                    is_correct_answer=True,
                    is_correct_document=True,
                    document_id=cur_doc.id,
                    offset_start_in_doc=0,
                    no_answer=qa["is_impossible"],
                    origin="gold_label",
                )
                labels.append(label)

    return docs, labels
示例#2
0
    def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
        """Store annotation labels under freshly generated UUID keys.

        :param labels: Label objects, or dictionaries that are converted with ``Label.from_dict``.
        :param index: Name of the target index; ``self.label_index`` is used when it is not given.
        """
        target_index = index or self.label_index

        # Convert everything up front so a conversion failure stores nothing.
        converted = []
        for entry in labels:
            if isinstance(entry, dict):
                entry = Label.from_dict(entry)
            converted.append(entry)

        for label_object in converted:
            new_key = str(uuid4())
            self.indexes[target_index][new_key] = label_object
示例#3
0
 def get_all_labels(
     self,
     index: Optional[str] = None,
     filters: Optional[Dict[str, List[str]]] = None,
 ) -> List[Label]:
     """
     Fetch every label stored in an index.

     :param index: Index to read from; ``self.label_index`` is used when it is not given.
     :param filters: Passed through unchanged to ``get_all_documents_in_index``.
     :return: One ``Label`` per returned hit, built from the hit's ``"_source"``.
     """
     target_index = index or self.label_index
     hits = self.get_all_documents_in_index(index=target_index, filters=filters)

     collected = []
     for hit in hits:
         collected.append(Label.from_dict(hit["_source"]))
     return collected
示例#4
0
文件: sql.py 项目: qtuter1997/topdup
 def _convert_sql_row_to_label(self, row) -> Label:
     """Build a ``Label`` by copying the label attributes of ``row`` one to one."""
     # Attributes that are read from the row and passed on under the same name.
     copied_fields = (
         "document_id",
         "no_answer",
         "origin",
         "question",
         "is_correct_answer",
         "is_correct_document",
         "answer",
         "offset_start_in_doc",
         "model_id",
     )
     label_kwargs = {name: getattr(row, name) for name in copied_fields}
     return Label(**label_kwargs)
示例#5
0
    def write_labels(self,
                     labels: Union[List[Label], List[dict]],
                     index: Optional[str] = None,
                     batch_size: Optional[int] = None):
        """Index annotation labels with Elasticsearch's bulk helper.

        :param labels: Haystack Label objects, or Python dictionaries that are
                       converted with ``Label.from_dict``.
        :param index: Name of the target index; ``self.label_index`` is used when it is not given.
                      The label index is created first if it does not exist yet.
        :param batch_size: Number of labels handed to the bulk function per call.
                           If `None`, all labels are sent in a single call.
        """
        index = index or self.label_index
        if index and not self.client.indices.exists(index=index):
            self._create_label_index(index)

        def _send(actions):
            # Single place that issues the bulk request for a collected batch.
            bulk(self.client,
                 actions,
                 request_timeout=300,
                 refresh=self.refresh_type)

        pending = []
        for item in labels:
            # Normalise to the Label class before serialising.
            label = Label.from_dict(item) if isinstance(item, dict) else item

            if self.update_existing_documents:
                op_type = "index"
            else:
                op_type = "create"
            action = {"_op_type": op_type, "_index": index}  # type: Dict[str, Any]
            action.update(label.to_dict())

            # Elasticsearch expects the identifier under "_id".
            if label.id is not None:
                action["_id"] = str(action.pop("id"))

            pending.append(action)

            # Flush as soon as a full batch has been collected.
            if batch_size is not None and len(pending) % batch_size == 0:
                _send(pending)
                pending = []

        # Send whatever is left (or everything, when no batch_size was given).
        if pending:
            _send(pending)
示例#6
0
文件: sql.py 项目: qtuter1997/topdup
    def write_labels(self, labels, index=None):
        """Persist annotation labels as ``LabelORM`` rows.

        Dictionaries in ``labels`` are converted with ``Label.from_dict`` first.
        Every label is then added to ``self.session`` as one row, and the
        session is committed once after all rows were added. ``index`` falls
        back to ``self.label_index`` when it is not given.
        """
        # Convert everything up front so a conversion failure adds no rows.
        label_objects = []
        for item in labels:
            label_objects.append(
                Label.from_dict(item) if isinstance(item, dict) else item
            )

        target_index = index or self.label_index
        # Label attributes that are copied to the row under the same name.
        copied_fields = (
            "document_id",
            "no_answer",
            "origin",
            "question",
            "is_correct_answer",
            "is_correct_document",
            "answer",
            "offset_start_in_doc",
            "model_id",
        )
        # TODO: Use batch_size
        for label in label_objects:
            row_values = {name: getattr(label, name) for name in copied_fields}
            self.session.add(LabelORM(**row_values, index=target_index))
        self.session.commit()