Example No. 1
import json
import logging

logger = logging.getLogger(__name__)

# Assumed from the surrounding module: `whitespace_tokenize` and the
# `SquadExample` class.


def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text,
                                cleaned_answer_text,
                            )
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible,
                )
                examples.append(example)
    return examples
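
For reference, here is a minimal sketch of the input this reader expects, using only the fields the code above actually touches. The file name and contents are made up for illustration:

import json

# Miniature SQuAD v2-style file (hypothetical data).
squad_entry = {
    "data": [{
        "paragraphs": [{
            "context": "BERT was published in 2018.",
            "qas": [{
                "id": "q1",
                "question": "When was BERT published?",
                "is_impossible": False,
                "answers": [{"text": "2018", "answer_start": 22}],
            }],
        }],
    }]
}

with open("tiny_squad.json", "w", encoding="utf-8") as f:
    json.dump(squad_entry, f)

examples = read_squad_examples("tiny_squad.json", is_training=True,
                               version_2_with_negative=True)
print(examples[0].start_position, examples[0].end_position)  # 4 4 ("2018." is token 4)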
Example No. 2
    # Part of an AllenNLP-style DatasetReader; `json`, `logger`, `cached_path`,
    # and `whitespace_tokenize` are assumed to be imported at module level, and
    # `self.text_to_instance` to be defined on the class.
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path, cache_dir="data/squad")
        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json["data"]
            logger.info("Reading the dataset")
            # NOTE: the [:1] slices restrict reading to the first article and
            # its first paragraph only; drop them to read the whole dataset.
            for article in dataset[:1]:
                for paragraph_json in article["paragraphs"][:1]:
                    paragraph_text = paragraph_json["context"]
                    # whitespace tokenization
                    paragraph_words = []
                    char_to_word_offset = []
                    prev_is_whitespace = True
                    for c in paragraph_text:
                        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                                c) == 0x202F:
                            prev_is_whitespace = True
                        else:
                            if prev_is_whitespace:
                                paragraph_words.append(c)
                            else:
                                paragraph_words[-1] += c
                            prev_is_whitespace = False

                        char_to_word_offset.append(len(paragraph_words) - 1)
                    # calc answer span
                    for question_answer in paragraph_json["qas"]:
                        question_id = question_answer["id"]
                        question_text = question_answer["question"].strip().replace("\n", "")
                        # All answers provided for a question are equivalent here,
                        # so take the first one, as most implementations do.
                        answer = question_answer["answers"][0]
                        answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            paragraph_words[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                        # convert to Instance
                        additional_metadata = {"id": question_id}
                        instance = self.text_to_instance(
                            question_text, paragraph_text, paragraph_words,
                            answer_text, start_position, end_position,
                            additional_metadata)
                        if instance is not None:
                            yield instance
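
The char-to-word offset mapping that this reader rebuilds inline (and that Example No. 1 builds via is_whitespace) is easiest to see on a toy string; a minimal standalone sketch:

text = "a bb  ccc"
words, char_to_word_offset = [], []
prev_is_whitespace = True
for c in text:
    if c in " \t\r\n" or ord(c) == 0x202F:
        prev_is_whitespace = True
    else:
        if prev_is_whitespace:
            words.append(c)   # start a new word
        else:
            words[-1] += c    # extend the current word
        prev_is_whitespace = False
    char_to_word_offset.append(len(words) - 1)

print(words)                # ['a', 'bb', 'ccc']
print(char_to_word_offset)  # [0, 0, 1, 1, 1, 1, 2, 2, 2]

Every character, including whitespace, gets an entry (whitespace maps to the preceding word's index), so char_to_word_offset[answer_start] is always a valid lookup.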
Example No. 3
    def tokenize(self, text):
        return whitespace_tokenize(text.lower())
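
whitespace_tokenize itself is not shown in any of these excerpts. In the BERT/transformers utilities it is essentially strip-and-split, so a stand-in like the following (a sketch matching the behavior the code above relies on) makes the snippets self-contained:

def whitespace_tokenize(text):
    """Run basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    return text.split()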
Example No. 4

import json
import logging

logger = logging.getLogger(__name__)

# Assumed from the surrounding module: `whitespace_tokenize` and the
# `RecordExample` class.


def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a ReCoRD jsonl file (one json object per line) into a list of
    RecordExample."""
    reader = open(input_file, "r", encoding='utf-8')

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for line in reader:
        raw = json.loads(line)
        source = raw['source']
        paragraph_text = raw['passage']['text']
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        for qa in raw['qas']:
            qas_id = qa['id']
            question_text = qa['query']
            start_position = None
            end_position = None
            orig_answer_text = None
            if is_training:
                # ReCoRD provides multiple gold answers per question; keep the
                # first one whose text can be exactly recovered from the
                # document. If none can be recovered it's likely due to weird
                # Unicode stuff, so we just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                found_answer = False
                for answer in qa["answers"]:
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) != -1:
                        found_answer = True
                        break
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                if not found_answer:
                    continue
            example = RecordExample(qas_id=qas_id,
                                    question_text=question_text,
                                    doc_tokens=doc_tokens,
                                    orig_answer_text=orig_answer_text,
                                    start_position=start_position,
                                    end_position=end_position)
            # Append inside the qa loop so every question is kept, not just
            # the last one per line.
            examples.append(example)

    reader.close()
    return examples
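
Unlike the SQuAD readers above, this one expects one JSON object per line (jsonl). A minimal line with only the fields the loop reads, with made-up values:

import json

record_line = json.dumps({
    "source": "example-source",
    "passage": {"text": "Alice visited Paris in May."},
    "qas": [{
        "id": "r1",
        "query": "Which city did Alice visit?",
        "answers": [{"text": "Paris", "answer_start": 14}],
    }],
})

with open("tiny_record.jsonl", "w", encoding="utf-8") as f:
    f.write(record_line + "\n")

examples = read_squad_examples("tiny_record.jsonl", is_training=True,
                               version_2_with_negative=False)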
Example No. 5
def create_samples_squad(entry):
    """Convert a single SQuAD json entry into a list of Sample objects."""
    # Assumed from the surrounding module: `whitespace_tokenize`, `logger`, and
    # the `Sample` class.

    # NOTE: unlike the readers above, this function splits the context on plain
    # spaces (below); `is_whitespace` is kept from the original but unused here.
    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    try:
        _ = entry["paragraphs"][0]["qas"][0]["is_impossible"]
        is_training = True
    except KeyError:
        is_training = False

    examples = []
    num_examples = 1
    for paragraph in entry["paragraphs"]:
        paragraph_text = paragraph["context"]

        char_to_word_offset = []
        doc_tokens = paragraph_text.split(" ")
        for i, t in enumerate(doc_tokens):
            char_to_word_offset.extend([i] * (len(t) + 1))
        char_to_word_offset = char_to_word_offset[:-1]  # cut off last added whitespace

        for qa in paragraph["qas"]:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            if is_training:
                is_impossible = qa["is_impossible"]
                # TODO check how to transform dev set with multiple possible answers, for now take only 1 answer
                # if (len(qa["answers"]) != 1) and (not is_impossible):
                #     raise ValueError(
                #         "For training, each question should have exactly 1 answer."
                #     )
                if not is_impossible:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[
                        answer_offset + answer_length - 1
                    ]
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    actual_text = " ".join(
                        doc_tokens[start_position : (end_position + 1)]
                    )
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text)
                    )
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning(
                            "Could not find answer: '%s' vs. '%s'",
                            actual_text,
                            cleaned_answer_text,
                        )
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

            clear_text = {}
            clear_text["qas_id"] = qas_id
            clear_text["question_text"] = question_text
            clear_text["doc_tokens"] = doc_tokens
            clear_text["orig_answer_text"] = orig_answer_text
            clear_text["start_position"] = start_position
            clear_text["end_position"] = end_position
            clear_text["is_impossible"] = is_impossible
            clear_text["is_training"] = is_training
            example = Sample(
                id=None, clear_text=clear_text, features=None, tokenized=None
            )
            num_examples += 1
            examples.append(example)
    return examples
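
The arithmetic in this variant assigns every character, including each separating space, to a token index, then trims the entry for the nonexistent space after the last token. A quick check on a toy context (note that, unlike Examples No. 1 and 2, only plain spaces act as separators here, so tabs and newlines stay inside tokens):

paragraph_text = "a bb ccc"
doc_tokens = paragraph_text.split(" ")              # ['a', 'bb', 'ccc']
char_to_word_offset = []
for i, t in enumerate(doc_tokens):
    char_to_word_offset.extend([i] * (len(t) + 1))  # token chars + trailing space
char_to_word_offset = char_to_word_offset[:-1]      # no space after the last token

print(char_to_word_offset)                          # [0, 0, 1, 1, 1, 2, 2, 2]
assert len(char_to_word_offset) == len(paragraph_text)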