Exemplo n.º 1
0
def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a SQuAD-style file.
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    One Document is created per paragraph and one Label per gold answer. A question
    without any answers yields a single Label with an empty answer string and offset 0.

    :param filename: Path to file in SQuAD format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    for document in data["data"]:
        # get all extra fields from document level (e.g. title)
        meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
        for paragraph in document["paragraphs"]:
            cur_meta = {"name": document["title"]}
            # all other fields from paragraph level
            meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
            cur_meta.update(meta_paragraph)
            # meta from parent document (wins over paragraph-level keys of the same name)
            cur_meta.update(meta_doc)
            # Create Document
            cur_doc = Document(text=paragraph["context"], meta=cur_meta)
            docs.append(cur_doc)

            # Get Labels
            for qa in paragraph["qas"]:
                # "is_impossible" only exists in SQuAD 2.0 style files; files without
                # the key are treated as containing answerable questions only.
                no_answer = qa.get("is_impossible", False)
                # A question without gold answers still gets exactly one (empty) label.
                answers = qa["answers"] or [{"text": "", "answer_start": 0}]
                for answer in answers:
                    label = Label(
                        question=qa["question"],
                        answer=answer["text"],
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_doc.id,
                        offset_start_in_doc=answer["answer_start"],
                        no_answer=no_answer,
                        origin="gold_label",
                    )
                    labels.append(label)

    return docs, labels
Exemplo n.º 2
0
def orconvqa_read_files(filename: str, qrelsfile: str, buildCorpus: bool = False, corpusFile: str = ""):
    """
    Read and combine files from the OR-Quac dataset

    Questions whose qid has no entry (or an empty entry) in the qrels file are skipped
    with a warning. If a qid is linked to several documents, the first one is used.

    :param filename: Name of the file containing one json object per line with the questions, qids,
                     answers and the question history
    :param qrelsfile: File in json format linking the qids to the doc ids of the golden passage
                      (the passage where the answer can be found)
    :param buildCorpus: Whether or not the corpus should be built while parsing the questions file
                        (requires corpus file)
    :param corpusFile: If buildCorpus is set, the documents are loaded from this file using
                       orconvqa_build_corpus
    :raises ValueError: If buildCorpus is set but corpusFile is not an existing file

    :return: (List of Labels, None|List of Documents)
    """

    docs = None
    if buildCorpus:
        if not os.path.isfile(corpusFile):
            raise ValueError(f'Could not find corpus file: {corpusFile}')
        docs = orconvqa_build_corpus(corpusFile)

    with open(qrelsfile, 'r', encoding="utf-8") as f:
        qrels = json.load(f)

    labels = []

    with open(filename, "r", encoding="utf-8") as file:
        # Stream the file line by line instead of loading all lines into memory
        for line in file:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue
            question = orjson.loads(line)
            qid = question['qid']

            try:
                q_doc_rel = qrels[qid]
            except KeyError:
                logger.warning(f'Qid {qid} not found in qrels, skipping question')
                # Actually skip: otherwise the relation of the previous question would be reused
                continue

            if not q_doc_rel:
                logger.warning(f'Qid {qid} has no documents in qrels, skipping question')
                continue

            if len(q_doc_rel) > 1:
                logger.warning('Found qrel with multiple docs, golden passage is unknown, assuming first')

            document_id = next(iter(q_doc_rel))

            label = Label(
                question=question["rewrite"],
                original_question=question['question'],
                answer=question["answer"]['text'],
                is_correct_answer=True,
                is_correct_document=True,
                # We do not do an extra check if the document id exists in the corpus, this may cause issues later
                document_id=document_id,
                offset_start_in_doc=question["answer"]['answer_start'],
                no_answer=question["answer"]['text'] == 'CANNOTANSWER',
                origin=filename,
                # TODO we do have some extra data here in the preprocessed file -> pq['answer'], ['answer_start'] and ['bid']
                previous_questions_in_conversation=[pq['question'] for pq in question['history']]
            )
            labels.append(label)

    return labels, docs
Exemplo n.º 3
0
 def _convert_sql_row_to_label(self, row) -> Label:
     """
     Build a Label from a single row object.

     :param row: Object exposing the label fields listed below as attributes
     :return: Label populated with the values read from the row
     """
     # Attribute names are read from the row and passed to Label under the same keyword
     copied_fields = (
         "document_id",
         "no_answer",
         "origin",
         "question",
         "is_correct_answer",
         "is_correct_document",
         "answer",
         "offset_start_in_doc",
         "model_id",
     )
     return Label(**{field: getattr(row, field) for field in copied_fields})
Exemplo n.º 4
0
    def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None):
        """
        Write labels to the label index with a single bulk request.

        :param labels: Label objects or dicts; dicts are converted with Label.from_dict
        :param index: Target index name. Falls back to self.label_index when not given.
        """
        index = index or self.label_index
        if index:
            # Create the label index first if the client reports that it does not exist yet
            if not self.client.indices.exists(index=index):
                self._create_label_index(index)

        # Make sure we comply to Label class format
        normalized = []
        for item in labels:
            normalized.append(Label.from_dict(item) if isinstance(item, dict) else item)

        # One bulk action per label: "index" or "create" depending on update_existing_documents
        actions = [
            {
                "_op_type": "index" if self.update_existing_documents else "create",
                "_index": index,
                **entry.to_dict(),
            }
            for entry in normalized
        ]  # type: List[Dict[str, Any]]
        bulk(self.client, actions, request_timeout=300, refresh=self.refresh_type)
Exemplo n.º 5
0
    def write_labels(self, labels, index=None):
        """
        Add one LabelORM per label to the session and commit once at the end.

        :param labels: Label objects or dicts; dicts are converted with Label.from_dict
        :param index: Index value stored on each row. Falls back to self.label_index when not given.
        """
        normalized = []
        for item in labels:
            normalized.append(Label.from_dict(item) if isinstance(item, dict) else item)

        target_index = index or self.label_index

        for entry in normalized:
            self.session.add(
                LabelORM(
                    document_id=entry.document_id,
                    no_answer=entry.no_answer,
                    origin=entry.origin,
                    question=entry.question,
                    is_correct_answer=entry.is_correct_answer,
                    is_correct_document=entry.is_correct_document,
                    answer=entry.answer,
                    offset_start_in_doc=entry.offset_start_in_doc,
                    model_id=entry.model_id,
                    index=target_index,
                )
            )

        # Single commit after all rows have been added
        self.session.commit()
Exemplo n.º 6
0
def CoQA_read_file(filename: str) -> Tuple[List[Document], List[Label]]:
    """
    Read Documents + Labels from a CoQA style file
    Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

    One Document is created per entry in data["data"] (using its "id" and "story") and one Label
    per question/answer pair. Each Label carries the questions of the same document that have a
    smaller turn_id as conversation history.

    :param filename: Path to file in CoQA format
    :return: (List of Documents, List of Labels)
    """
    docs = []
    labels = []

    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    for document in data["data"]:
        # get all extra fields from document level (everything except the questions and answers)
        meta_doc = {k: v for k, v in document.items() if k not in ("questions", "answers")}
        cur_doc = Document(id=document['id'], text=document["story"], meta=meta_doc)
        docs.append(cur_doc)

        # Get Labels: questions and answers are paired by position
        for q, a in zip(document["questions"], document['answers']):
            label = Label(
                question=q["input_text"],
                # TODO these are very short answers and may not always match with the span_start
                # The retrieved answer on span_text is longer and input_text is taken from that
                answer=a['input_text'],
                is_correct_answer=True,
                is_correct_document=True,
                # We do not do an extra check if the document id exists in the corpus, this may cause issues later
                document_id=cur_doc.id,
                offset_start_in_doc=a["span_start"],
                origin=filename,
                # history = all questions of this document asked in an earlier turn
                previous_questions_in_conversation=[
                    pq['input_text'] for pq in document['questions'] if pq['turn_id'] < q['turn_id']
                ],
            )
            labels.append(label)

    return docs, labels
Exemplo n.º 7
0
 def get_all_labels(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
     """
     Fetch all entries of the label index and convert them to Label objects.

     :param index: Index to read from. Falls back to self.label_index when not given.
     :param filters: Passed through unchanged to get_all_documents_in_index
     :return: One Label per returned hit, built from the hit's "_source" dict
     """
     target_index = index or self.label_index
     hits = self.get_all_documents_in_index(index=target_index, filters=filters)
     collected = []
     for hit in hits:
         collected.append(Label.from_dict(hit["_source"]))
     return collected