Example #1
    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        """_convert_example_to_record"""
        tokens = tokenization.whitespace_tokenize(example.text_a)
        labels = tokenization.whitespace_tokenize(example.label)
        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)

        if len(tokens) > max_seq_length - 2:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))
        text_type_ids = [0] * len(token_ids)
        no_entity_id = len(self.label_map) - 1
        label_ids = [no_entity_id
                     ] + [self.label_map[label]
                          for label in labels] + [no_entity_id]

        Record = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
        record = Record(token_ids=token_ids,
                        text_type_ids=text_type_ids,
                        position_ids=position_ids,
                        label_ids=label_ids)
        return record
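A minimal, self-contained sketch of how the Record above lines up for a toy NER example; the vocabulary and label_map here are made up for illustration and are not from the project:

from collections import namedtuple

label_map = {"B-PER": 0, "I-PER": 1, "O": 2}      # assumed label set; "O" is the no-entity label
vocab = {"[CLS]": 101, "[SEP]": 102, "john": 1037, "smith": 2052}  # toy token ids

tokens = ["john", "smith"]
labels = ["B-PER", "I-PER"]

tokens = ["[CLS]"] + tokens + ["[SEP]"]
token_ids = [vocab[t] for t in tokens]
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
no_entity_id = len(label_map) - 1                 # the last label pads the [CLS]/[SEP] positions
label_ids = [no_entity_id] + [label_map[l] for l in labels] + [no_entity_id]

Record = namedtuple("Record", ["token_ids", "text_type_ids", "position_ids", "label_ids"])
print(Record(token_ids, text_type_ids, position_ids, label_ids))
# Record(token_ids=[101, 1037, 2052, 102], text_type_ids=[0, 0, 0, 0],
#        position_ids=[0, 1, 2, 3], label_ids=[2, 0, 1, 2])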
Example #2
def read_squad_example(entry, is_training):
    examples = []
    for paragraph in entry["paragraphs"]:
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        for qa in paragraph["qas"]:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            if is_training:
                if len(qa["answers"]) != 1:
                    raise ValueError(
                        "For training, each question should have exactly 1 answer."
                    )
                answer = qa["answers"][0]
                orig_answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]
                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue

            example = SquadExample(qas_id=qas_id,
                                   question_text=question_text,
                                   doc_tokens=doc_tokens,
                                   orig_answer_text=orig_answer_text,
                                   start_position=start_position,
                                   end_position=end_position)
            examples.append(example)
    return examples
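The whitespace-splitting loop above recurs in nearly every example on this page: it builds doc_tokens together with a char_to_word_offset list that maps each character position of the context back to the index of the word containing it, which is what turns answer_start into start_position. A standalone sketch (standard library only, sample string made up):

def context_to_words_and_offsets(text):
    doc_tokens, char_to_word_offset = [], []
    prev_is_whitespace = True
    for c in text:
        if c in " \t\r\n" or ord(c) == 0x202F:
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)        # start a new word
            else:
                doc_tokens[-1] += c         # extend the current word
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)
    return doc_tokens, char_to_word_offset

words, offsets = context_to_words_and_offsets("New York City")
print(words)    # ['New', 'York', 'City']
print(offsets)  # [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2]
# a character offset inside "York" (e.g. 5) maps to word index 1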
Example #3
    def tokenize(self, text):
        text = [convert_to_unicode(a) for a in text]
        text2 = []
        for token in text:
            text2 += self._clean_text(token)
        split_tokens = []
        for token in text2:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.append(token)
        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

def read_squad_examples(id, paragraph, question, tokenizer):

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    def is_control(char):
        """Checks whether `chars` is a control character."""
        # These are technically control characters but we count them as whitespace
        # characters.
        if char == "\t" or char == "\n" or char == "\r":
            return False
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            return True
        return False

    def clean_text(text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or is_control(char):
                continue
            if is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    examples = []
    paragraph_text = " ".join(tokenization.whitespace_tokenize(clean_text(paragraph)))
    doc_tokens = tokenizer.basic_tokenizer.tokenize(paragraph_text)
    qas_id = id
    question_text = question
    start_position = None
    end_position = None
    orig_answer_text = None
    example = SquadExample(
        qas_id=qas_id,
        question_text=question_text,
        doc_tokens=doc_tokens,
        orig_answer_text=orig_answer_text,
        start_position=start_position,
        end_position=end_position)
    examples.append(example)
    return examples
Example #5
def can_find(text, offset, length, tokens, char_to_word_offset):
    start_position = char_to_word_offset[offset]
    end_position = char_to_word_offset[offset + length - 1]
    # Only add answers where the text can be exactly recovered from the
    # document. If this CAN'T happen it's likely due to weird Unicode
    # stuff so we will just skip the example.
    #
    # Note that this means for training mode, every example is NOT
    # guaranteed to be preserved.
    actual_text = " ".join(tokens[start_position:(end_position + 1)])
    cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(text))
    if actual_text.find(cleaned_answer_text) == -1:
        tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                           actual_text, cleaned_answer_text)
        return None, None
    return start_position, end_position
Example #6
def detect_span(_answers, context, doc_tokens, char_to_word_offset):
    orig_answer_texts = []
    start_positions = []
    end_positions = []
    switches = []

    answers = []
    for answer in _answers:
        answers += find_span_from_text(context, doc_tokens, answer['text'])

    for answer in answers:
        orig_answer_text = answer["text"]
        answer_offset = answer["answer_start"]
        answer_length = len(orig_answer_text)

        switch = 0
        if 'word_start' in answer and 'word_end' in answer:
            start_position = answer['word_start']
            end_position = answer['word_end']
        else:
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length -
                                               1]
        # Only add answers where the text can be exactly recovered from the
        # document. If this CAN'T happen it's likely due to weird Unicode
        # stuff so we will just skip the example.
        #
        # Note that this means for training mode, every example is NOT
        # guaranteed to be preserved.
        actual_text = " ".join(
            doc_tokens[start_position:(end_position + 1)]).replace(
                ' ##', '').replace('##', '')
        cleaned_answer_text = " ".join(
            tokenization.whitespace_tokenize(orig_answer_text))
        if actual_text.replace(' ', '').find(
                cleaned_answer_text.replace(' ', '')) == -1:
            print("Could not find answer: '%s' vs. '%s'" %
                  (actual_text, cleaned_answer_text))

        orig_answer_texts.append(orig_answer_text)
        start_positions.append(start_position)
        end_positions.append(end_position)
        switches.append(switch)

    return orig_answer_texts, switches, start_positions, end_positions
Example #7
def read_record_examples(input_file, is_training):
    """Read a ReCoRD json file into a list of ReCoRDExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        paragraph_text = entry["passage"]["text"].replace('\xa0', ' ')
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # load entities in passage
        passage_entities = []
        for entity in entry['passage']['entities']:
            entity_start_offset = entity['start']
            entity_end_offset = entity['end']
            if entity_end_offset < entity_start_offset:  # skip entities with mislabeled spans in the ReCoRD dataset
                continue
            entity_text = paragraph_text[entity_start_offset: entity_end_offset + 1]
            passage_entities.append({'orig_text': entity_text, 
                                     'start_position': char_to_word_offset[entity_start_offset], 
                                     'end_position': char_to_word_offset[entity_end_offset]})            

        for qa in entry["qas"]:
            qas_id = qa["id"]
            question_text = qa["query"].replace('\xa0', ' ')
            start_position = None
            end_position = None
            orig_answer_text = None
            if is_training:
                # if len(qa["answers"]) != 1:
                #     raise ValueError(
                #         "For training, each question should have exactly 1 answer.")
                answer = qa["answers"][0]
                orig_answer_text = answer["text"]
                answer_offset = answer["start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                        actual_text, cleaned_answer_text)
                    continue

            example = ReCoRDExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                passage_entities=passage_entities,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position)
            examples.append(example)
    return examples
Example #8
def read_squad_examples(input_file, return_answers, context_only=False, question_only=False,
                        draft=False, draft_num_examples=12, append_title=False):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    ans_cnt = 0
    no_ans_cnt = 0

    # Only word-based (whitespace) tokenization is performed
    for doc_idx, entry in enumerate(input_data):
        title = entry['title'][0] if type(entry['title']) == list else entry['title']
        assert type(title) == str

        for par_idx, paragraph in enumerate(entry["paragraphs"]):
            # Do not load context for question only
            if not question_only:
                paragraph_text = paragraph["context"]
                title_offset = 0
                if append_title:
                    title_str = '[ ' + ' '.join(title.split('_')) + ' ] '
                    title_offset += len(title_str)
                    paragraph_text = title_str + paragraph_text
                # Note that we use the term 'word' for whitespace based words, and 'token' for subtokens (for BERT input)
                doc_words, char_to_word_offset = context_to_words_and_offset(paragraph_text)

            # 1) Context only ends here
            if context_only:
                metadata = {}
                if "pubmed_id" in entry:
                    entry_keys = [
                        "pubmed_id", "sha", "title_original", "title_entities",
                        "journal", "authors", "article_idx"
                    ]
                    para_keys = ["context_entities"]
                    for entry_key in entry_keys:
                        if entry_key in entry:
                            metadata[entry_key] = entry[entry_key]
                    for para_key in para_keys:
                        if para_key in paragraph:
                            metadata[para_key] = paragraph[para_key]
                    # metadata["pubmed_id"] = (metadata["pubmed_id"] if not pd.isnull(metadata["pubmed_id"])
                    #     else 'NaN')
                example = SquadExample(
                    doc_words=doc_words,
                    title=title,
                    doc_idx=doc_idx,
                    par_idx=par_idx,
                    metadata=metadata)
                examples.append(example)

                if draft and len(examples) == draft_num_examples:
                    return examples
                continue

            # 2) Question only or 3) context/question pair
            else:
                for qa in paragraph["qas"]:
                    qas_id = str(qa["id"])
                    question_text = qa["question"]

                    # Noisy question skipping
                    if len(question_text.split(' ')) == 1:
                        logger.info('Skipping a single word question: {}'.format(question_text))
                        continue
                    if "I couldn't could up with another question." in question_text:
                        logger.info('Skipping a strange question: {}'.format(question_text))
                        continue

                    start_position = None
                    end_position = None
                    orig_answer_text = None

                    # For pre-processing that should return answers together
                    if return_answers:
                        assert type(qa["answers"]) == dict or type(qa["answers"]) == list, type(qa["answers"])
                        if type(qa["answers"]) == dict:
                            qa["answers"] = [qa["answers"]]

                        # No answers
                        if len(qa["answers"]) == 0:
                            orig_answer_text = ""
                            start_position = -1 # Word-level no-answer => -1
                            end_position = -1
                            no_ans_cnt += 1
                        # Answer exists
                        else:
                            answer = qa["answers"][0]
                            ans_cnt += 1

                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"] + title_offset
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[answer_offset + answer_length - 1]

                            # Only add answers where the text can be exactly recovered from the context
                            actual_text = " ".join(doc_words[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(orig_answer_text)) # word based tokenization
                            if actual_text.find(cleaned_answer_text) == -1:
                                logger.warning("Could not find answer: '%s' vs. '%s'",
                                               actual_text, cleaned_answer_text)
                                continue

                    # Question only ends here
                    if question_only:
                        example = SquadExample(
                            qas_id=qas_id,
                            question_text=question_text)

                    # Context/question pair ends here
                    else:
                        example = SquadExample(
                            qas_id=qas_id,
                            question_text=question_text,
                            paragraph_text=paragraph_text,
                            doc_words=doc_words,
                            orig_answer_text=orig_answer_text,
                            start_position=start_position,
                            end_position=end_position,
                            title=title,
                            doc_idx=doc_idx,
                            par_idx=par_idx)
                    examples.append(example)

                    if draft and len(examples) == draft_num_examples:
                        return examples

    # Testing with a shuffled draft (requires commenting out the 'draft' early returns above)
    if draft:
        random.shuffle(examples)
        logger.info(str(len(examples)) + ' were collected before draft for shuffling')
        return examples[:draft_num_examples]

    logger.info('Answer/no-answer stat: %d vs %d'%(ans_cnt, no_ans_cnt))
    return examples
Example #9
def read_nq_entry(entry, is_training):
    """Converts a NQ entry into a list of NqExamples."""
    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    contexts_id = entry["id"]
    contexts = entry["contexts"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in contexts:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    questions = []
    for i, question in enumerate(entry["questions"]):
        qas_id = "{}".format(contexts_id)
        question_text = question["input_text"]
        start_position = None
        end_position = None
        answer = None
        if is_training:
            answer_dict = entry["answers"][i]
            answer = make_nq_answer(contexts, answer_dict)

            # For now, only handle extractive, yes, and no.
            if answer is None or answer.offset is None:
                continue
            start_position = char_to_word_offset[answer.offset]
            end_position = char_to_word_offset[answer.offset +
                                               len(answer.text) - 1]

            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position +
                                                              1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(answer.text))
            if actual_text.find(cleaned_answer_text) == -1:
                tf.compat.v1.logging.warning(
                    "Could not find answer: '%s' vs. '%s'", actual_text,
                    cleaned_answer_text)
                continue

        questions.append(question_text)
        example = NqExample(example_id=int(contexts_id),
                            qas_id=qas_id,
                            questions=questions[:],
                            doc_tokens=doc_tokens,
                            doc_tokens_map=entry.get("contexts_map", None),
                            answer=answer,
                            start_position=start_position,
                            end_position=end_position)
        examples.append(example)
    return examples
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    #
    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text, do_lower_case=squad_params.do_lower_case)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True

            k = 0
            temp_word = ""
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if squad_params.do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1

            assert k == len(raw_doc_tokens)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None

                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]

                    if orig_answer_text not in paragraph_text:
                        tf.logging.warning("Could not find answer")
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]

                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if squad_params.do_lower_case:
                            cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                            continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)
    tf.logging.info("**********read_squad_examples complete!**********")

    return examples
Example #11
    def _create_examples(self, data_split, dataset_type='v1.1'):
        input_file = os.path.join(
            self.data_dir, '{}-{}.json'.format(data_split, dataset_type))
        with open(input_file, "r") as reader:
            input_data = json.load(reader)["data"]

        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F or ord(c) == 160:
                return True
            return False

        doc_count = 0
        examples = []
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                doc_tokens = []
                char_to_word_offset = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                    char_to_word_offset.append(len(doc_tokens) - 1)

                doc_count += 1
                doc_id = '{}-{}'.format(data_split, doc_count)

                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None
                    is_impossible = False
                    # note: use the first answer for all dataset splits.
                    # if len(qa["answers"]) != 1:
                    #     raise ValueError(
                    #         "For training, each question should have exactly 1 answer.")
                    if dataset_type == 'v2.0':
                        is_impossible = qa['is_impossible']
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        orig_answer_texts = [a['text'] for a in qa["answers"]]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            print("Could not find answer: '%s' vs. '%s'",
                                  actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ''
                        orig_answer_texts = []

                    example = InputAnswerExtractionExample(
                        did=doc_id,
                        qid=qas_id,
                        query_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                        orig_answer_texts=orig_answer_texts,
                        is_impossible=is_impossible)
                    examples.append(example)

        if self.candidate_filepath[data_split]:
            return mc_converter(self.candidate_filepath[data_split], examples)
        else:
            return examples
Example #12
File: squad.py  Project: wuhuaquan/ernie
def read_squad_examples(input_file,
                        is_training,
                        version_2_with_negative=False):
    """Read a SQuAD json file into a list of SquadExample."""
    with io.open(input_file, "r", encoding="utf8") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            #            doc_tokens = []
            #            char_to_word_offset = []
            #            prev_is_whitespace = True
            #            for c in paragraph_text:
            #                if is_whitespace(c):
            #                    prev_is_whitespace = True
            #                else:
            #                    if prev_is_whitespace:
            #                        doc_tokens.append(c)
            #                    else:
            #                        doc_tokens[-1] += c
            #                    prev_is_whitespace = False
            #                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_pos = None
                end_pos = None
                orig_answer_text = None
                is_impossible = False
                if is_training:

                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        doc_tokens = [
                            paragraph_text[:answer_offset],
                            paragraph_text[answer_offset:answer_offset +
                                           answer_length],
                            paragraph_text[answer_offset + answer_length:]
                        ]
                        start_pos = 1
                        end_pos = 1
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        #actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        actual_text = " ".join(doc_tokens[start_pos:(end_pos +
                                                                     1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            print("Could not find answer: '%s' vs. '%s'",
                                  actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_pos = -1
                        end_pos = -1
                        orig_answer_text = ""
                else:
                    doc_tokens = tokenization.tokenize_chinese_chars(
                        paragraph_text)

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_pos,
                                       end_position=end_pos,
                                       is_impossible=is_impossible)
                examples.append(example)

    return examples
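A rough illustration of the three-chunk construction used in the training branch above: the context is split into the text before the answer, the answer itself, and the text after it, so the answer span is always "word" index 1. The strings below are made up:

paragraph_text = "Paris is the capital of France."
orig_answer_text = "France"
answer_offset = paragraph_text.index(orig_answer_text)
answer_length = len(orig_answer_text)
doc_tokens = [
    paragraph_text[:answer_offset],                                # everything before the answer
    paragraph_text[answer_offset:answer_offset + answer_length],  # the answer itself
    paragraph_text[answer_offset + answer_length:],               # everything after the answer
]
print(doc_tokens)     # ['Paris is the capital of ', 'France', '.']
print(doc_tokens[1])  # 'France'  -> start_pos = end_pos = 1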
Example #13
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a generator of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:

                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text,
                                cleaned_answer_text,
                            )
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                yield SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible,
                )
def estimate_runtime_examples(data_path, sample_rate, tokenizer, \
                              max_seq_length, doc_stride, max_query_length, \
                              remove_impossible_questions=True, filter_invalid_spans=True):
    """Count runtime examples which may differ from number of raw samples due to sliding window operation and etc.. This is useful to get correct warmup steps for training."""

    assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0"

    print("loading data with json parser...")
    with open(data_path, "r") as reader:
        data = json.load(reader)["data"]

    num_raw_examples = 0
    for entry in data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            for qa in paragraph["qas"]:
                num_raw_examples += 1
    print("num raw examples:{}".format(num_raw_examples))

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    sampled_examples = []
    for entry in data:
        for paragraph in entry["paragraphs"]:
            doc_tokens = None
            for qa in paragraph["qas"]:
                if random.random() > sample_rate and sample_rate < 1.0:
                    continue

                if doc_tokens is None:
                    paragraph_text = paragraph["context"]
                    doc_tokens = []
                    char_to_word_offset = []
                    prev_is_whitespace = True
                    for c in paragraph_text:
                        if is_whitespace(c):
                            prev_is_whitespace = True
                        else:
                            if prev_is_whitespace:
                                doc_tokens.append(c)
                            else:
                                doc_tokens[-1] += c
                            prev_is_whitespace = False
                        char_to_word_offset.append(len(doc_tokens) - 1)

                assert len(
                    qa["answers"]
                ) == 1, "For training, each question should have exactly 1 answer."

                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False

                if ('is_impossible' in qa) and (qa["is_impossible"]):
                    if remove_impossible_questions or filter_invalid_spans:
                        continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""
                        is_impossible = True
                else:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]

                    # remove corrupt samples
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        print("Could not find answer: '%s' vs. '%s'",
                              actual_text, cleaned_answer_text)
                        continue

                example = MRQAExample(qas_id=qas_id,
                                      question_text=question_text,
                                      doc_tokens=doc_tokens,
                                      orig_answer_text=orig_answer_text,
                                      start_position=start_position,
                                      end_position=end_position,
                                      is_impossible=is_impossible)
                sampled_examples.append(example)

    runtime_sample_rate = len(sampled_examples) / float(num_raw_examples)
    # print("DEBUG-> runtime sampled examples: {}, sample rate: {}.".format(len(sampled_examples), runtime_sample_rate))

    runtime_samp_cnt = 0

    for example in sampled_examples:
        query_tokens = tokenizer.tokenize(example.question_text)

        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None

        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            doc_start = doc_span.start
            doc_end = doc_span.start + doc_span.length - 1
            if filter_invalid_spans and not (tok_start_position >= doc_start
                                             and tok_end_position <= doc_end):
                continue
            runtime_samp_cnt += 1
    return int(runtime_samp_cnt / runtime_sample_rate)
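A small sketch (the parameter values below are made up) of the sliding-window enumeration in estimate_runtime_examples: long documents are split into overlapping _DocSpan windows of at most max_tokens_for_doc sub-tokens, stepped by doc_stride, which is why the runtime example count can exceed the raw count:

import collections

def enumerate_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    doc_spans, start_offset = [], 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

# e.g. a 300-token document with 128-token windows and stride 64:
print(enumerate_doc_spans(300, 128, 64))
# [DocSpan(start=0, length=128), DocSpan(start=64, length=128),
#  DocSpan(start=128, length=128), DocSpan(start=192, length=108)]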
Example #15
def read_quac_examples(input_file, is_training):
    """Read a QuAC json file into a list of CQAExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    if FLAGS.load_small_portion:
        input_data = input_data[:10]
        # print('input_data:', input_data)
        tf.logging.warning('<<<<<<<<<< load_small_portion is on! >>>>>>>>>>')
    for entry in input_data:
        # An additional "CANNOTANSWER" has been added in QuAC data, so no need to append one.
        entry = entry['paragraphs'][0]
        paragraph_text = entry["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)
            
        ############################################################
        # convert the convasational QAs to squad format, with history
        ############################################################

        questions = [(item['question'], item['id']) for item in entry['qas']] # [(question, question_id), ()]
        answers = [(item['orig_answer']['text'], item['orig_answer']['answer_start']) for item in entry['qas']]
        followups = [item['followup'] for item in entry['qas']]
        yesnos = [item['yesno'] for item in entry['qas']]

        qas = []
        for i, (question, answer, followup, yesno) in enumerate(zip(questions, answers, followups, yesnos)):
            metadata = {'turn': i + 1, 'history_turns': [], 'tok_history_answer_markers':[], 
                        'followup': followup, 'yesno': yesno, 'history_turns_text': []}
            # if FLAGS.use_RL:
            #     start_index = 0
            # else:
            #     start_index = 0 if i - int(FLAGS.history) < 0 else i - int(FLAGS.history)
            
            end_index = i
            question_with_histories = ''
            
            history_answer_marker = None
            if FLAGS.use_history_answer_marker:
                start_index = 0  # read all the histories whether or not we use RL; appropriate selections are made afterwards
                history_answer_marker = []
                for history_turn, (each_answer, each_question) in enumerate(
                    zip(answers[start_index: end_index], questions[start_index: end_index])):
                    
                    # [history_answer_start, history_answer_end, history_answer_text]
                    each_marker = [each_answer[1], each_answer[1] + len(each_answer[0]), each_answer[0]]
                    history_answer_marker.append(each_marker)
                    metadata['history_turns'].append(history_turn + start_index + 1)
                    metadata['history_turns_text'].append((each_question[0], each_answer[0])) #[(q1, a1), (q2, a2), ...]
            else:
                # prepend historical questions and answers
                start_index = max(end_index - FLAGS.history, 0)
                if FLAGS.only_history_answer:
                    for each_answer in answers[start_index: end_index]:
                        question_with_histories += each_answer[0] + ' '
                else:
                    for each_question, each_answer in zip(questions[start_index: end_index], answers[start_index: end_index]):
                        question_with_histories += each_question[0] + ' ' + each_answer[0] + ' '
            # add the current question
            question_with_histories += question[0]
            qas.append({'id': question[1], 'question': question_with_histories, 'answers': [{'answer_start': answer[1], 'text': answer[0]}],
                        'history_answer_marker': history_answer_marker, 'metadata': metadata})

        for qa in qas:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            
            # if is_training:
            # we read in the ground-truth answer both during training and prediction, because we need to compute accuracy and F1 at prediction time.
            if len(qa["answers"]) != 1:
                raise ValueError(
                    "For training, each question should have exactly 1 answer.")
            answer = qa["answers"][0]
            orig_answer_text = answer["text"]
            answer_offset = answer["answer_start"]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(orig_answer_text))
            
            if is_training and actual_text.find(cleaned_answer_text) == -1:
                tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                continue
                
            # we construct a tok_history_answer_marker to store the aggregated history answer markers for a question.
            # we also construct each_tok_history_answer_marker to store a single history answer marker.
            tok_history_answer_marker = [0] * len(doc_tokens)
            if FLAGS.use_history_answer_marker:
                for marker_index, marker in enumerate(qa['history_answer_marker']):
                    each_tok_history_answer_marker = [0] * len(doc_tokens)
                    history_orig_answer_text = marker[2]
                    history_answer_offset = marker[0]
                    history_answer_length = len(history_orig_answer_text)
                    history_start_position = char_to_word_offset[history_answer_offset]
                    history_end_position = char_to_word_offset[history_answer_offset + history_answer_length - 1]
                    history_actual_text = " ".join(doc_tokens[history_start_position:(history_end_position + 1)])
                    history_cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(history_orig_answer_text))
                    if history_actual_text.find(history_cleaned_answer_text) != -1:
                        tok_history_answer_marker = tok_history_answer_marker[: history_start_position] + \
                                            [1] * (history_end_position - history_start_position + 1) + \
                                            tok_history_answer_marker[history_end_position + 1 :]
                        each_tok_history_answer_marker = each_tok_history_answer_marker[: history_start_position] + \
                                            [1] * (history_end_position - history_start_position + 1) + \
                                            each_tok_history_answer_marker[history_end_position + 1 :]
                        assert len(tok_history_answer_marker) == len(doc_tokens)
                        assert len(each_tok_history_answer_marker) == len(doc_tokens)
                        qa['metadata']['tok_history_answer_markers'].append(each_tok_history_answer_marker)
                    else:
                        tf.logging.warning("Could not find history answer: '%s' vs. '%s'", history_actual_text, history_cleaned_answer_text)                                    

            example = CQAExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                history_answer_marker=tok_history_answer_marker,
                metadata=qa['metadata'])
            examples.append(example)
            # print(example)
    return examples
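A minimal sketch, with assumed toy positions, of the history-answer-marker construction above: document words covered by a previous turn's answer are flagged with 1s so the model can see where earlier answers were located:

def mark_history_answer(num_doc_tokens, start_position, end_position):
    marker = [0] * num_doc_tokens
    marker[start_position:end_position + 1] = [1] * (end_position - start_position + 1)
    return marker

print(mark_history_answer(8, 2, 4))  # [0, 0, 1, 1, 1, 0, 0, 0]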
Example #16
def read_squad_examples(input_file, is_training, do_lower_case):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text,
                                                 do_lower_case=do_lower_case)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True

            k = 0
            temp_word = ""
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if do_lower_case is True:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1

            try:
                assert k == len(raw_doc_tokens)
            except AssertionError:
                print(len(raw_doc_tokens), len(doc_tokens))
                for i in range(min(len(doc_tokens), len(raw_doc_tokens))):
                    if raw_doc_tokens[i] != doc_tokens[i]:
                        print(raw_doc_tokens[i - 3:i + 3],
                              doc_tokens[i - 3:i + 3])
                        break
                print(''.join(doc_tokens[500:]))
                print("----")
                print(''.join(raw_doc_tokens[500:]))
                raise AssertionError

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                is_impossible = False
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    is_impossible = len(qa['answers']) == 0
                    if len(qa["answers"]) > 1:
                        pass
                        #raise ValueError(
                        #    "For training, each question should have less than 1 answer.")
                    if len(qa['answers']) == 0:
                        orig_answer_text = ""
                        start_position = end_position = 0  # use_cls
                    else:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        if orig_answer_text not in paragraph_text:
                            logger.warning("Could not find answer")
                            continue
                        answer_offset = paragraph_text.index(orig_answer_text)
                        #answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if do_lower_case:
                            cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
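
The alignment loop above reconciles the raw characters of the context with the output of customize_tokenizer one token at a time. The following minimal sketch shows the same idea with a toy stand-in for customize_tokenizer, so the function name and data here are illustrative only:

def align_chars_to_tokens(paragraph_text, raw_doc_tokens):
    """Accumulate characters into temp_word until it matches the next raw token."""
    doc_tokens, char_to_word_offset = [], []
    k, temp_word = 0, ""
    for c in paragraph_text:
        if c.isspace():
            char_to_word_offset.append(k - 1)  # whitespace maps to the previous token
            continue
        temp_word += c
        char_to_word_offset.append(k)
        if temp_word == raw_doc_tokens[k]:     # finished reconstructing token k
            doc_tokens.append(temp_word)
            temp_word = ""
            k += 1
    assert k == len(raw_doc_tokens)
    return doc_tokens, char_to_word_offset

# toy stand-in for customize_tokenizer("ab cd")
tokens, offsets = align_chars_to_tokens("ab cd", ["ab", "cd"])
assert tokens == ["ab", "cd"] and offsets == [0, 0, 0, 1, 1]
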
Example #17
0
File: run_squad.py  Project: Wanke15/bert
def read_squad_examples(input_file, is_training):
  """Read a SQuAD json file into a list of SquadExample."""
  with tf.gfile.Open(input_file, "r") as reader:
    input_data = json.load(reader)["data"]

  def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
      return True
    return False

  examples = []
  for entry in input_data:
    for paragraph in entry["paragraphs"]:
      paragraph_text = paragraph["context"]
      doc_tokens = []
      char_to_word_offset = []
      prev_is_whitespace = True
      for c in paragraph_text:
        if is_whitespace(c):
          prev_is_whitespace = True
        else:
          if prev_is_whitespace:
            doc_tokens.append(c)
          else:
            doc_tokens[-1] += c
          prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

      for qa in paragraph["qas"]:
        qas_id = qa["id"]
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        if is_training:

          if FLAGS.version_2_with_negative:
            is_impossible = qa["is_impossible"]
          if (len(qa["answers"]) != 1) and (not is_impossible):
            raise ValueError(
                "For training, each question should have exactly 1 answer.")
          if not is_impossible:
            answer = qa["answers"][0]
            orig_answer_text = answer["text"]
            answer_offset = answer["answer_start"]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length -
                                               1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(
                doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
              tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                 actual_text, cleaned_answer_text)
              continue
          else:
            start_position = -1
            end_position = -1
            orig_answer_text = ""

        example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible)
        examples.append(example)

  return examples
def read_classifier_examples(input_file, labels, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                #if (len(qa["answers"]) != 1):

                if not is_impossible:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        #tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                        #actual_text, cleaned_answer_text)
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
                label = labels[qas_id]
                example = run_classifier.InputExample(guid=qas_id,
                                                      text_a=question_text,
                                                      text_b=orig_answer_text,
                                                      label=label)
                examples.append(example)
    return examples
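
Both readers above rely on the same whitespace tokenization plus a char_to_word_offset list to turn a character-level answer_start into word-level start/end positions. The sketch below is a self-contained illustration of that mapping; the function name and sample text are made up for the example:

def char_to_word_positions(paragraph_text, answer_text, answer_start):
    """Map a character-level answer span onto whitespace-token indices."""
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if c in " \t\r\n":               # simplified whitespace test
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)     # start a new token
            else:
                doc_tokens[-1] += c      # extend the current token
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    start_position = char_to_word_offset[answer_start]
    end_position = char_to_word_offset[answer_start + len(answer_text) - 1]
    return doc_tokens, start_position, end_position

# "won the cup" starts at character 9 of the context
tokens, start, end = char_to_word_positions("The team won the cup in 1998.",
                                            "won the cup", 9)
assert " ".join(tokens[start:end + 1]) == "won the cup"
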
Example #19
0
    def read_squad_examples(input_file, is_training):
        """Read a SQuAD json file into a list of SquadExample."""
        #  with tf.gfile.Open(input_file, "r") as reader:
        #    input_data = json.load(reader)["data"]
 
        file_names = os.listdir(input_file)
        file_names = [a for a in file_names if a.endswith('.json')]
        dataset = []
        for file_name in file_names:
          data_file = os.path.join(input_file, file_name)
          with tf.io.gfile.GFile(data_file,'r') as f:
              dataset_json = json.load(f)
              dataset.extend(dataset_json['data'])
 
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
              return True
            return False
 
        examples = []
        for entry in dataset:
            paragraph_text = entry["context"]
            doc_tokens = []
            # Tokenize the context on whitespace.
            char_to_word_offset = []  # token index for each character
            prev_is_whitespace = True
            for c in paragraph_text:  # loop over the context one character at a time
              if is_whitespace(c):
                prev_is_whitespace = True
              else:
                if prev_is_whitespace: # the previous character was whitespace,
                  doc_tokens.append(c) # so start a new token
                else:
                  doc_tokens[-1] += c # otherwise extend the last token
                prev_is_whitespace = False
              char_to_word_offset.append(len(doc_tokens) - 1)
 
            for qa in entry["qas"]:
              qas_id = qa["id"]
              question_text = qa["question"]
              start_position = None
              end_position = None
              orig_answer_text = None
              if is_training:
                answer = qa["answer"]
                orig_answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                  tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                     actual_text, cleaned_answer_text)
                  continue
 
              example = SquadExample(
                  qas_id=qas_id,
                  question_text=question_text,
                  doc_tokens=doc_tokens,
                  orig_answer_text=orig_answer_text,
                  start_position=start_position,
                  end_position=end_position)
              examples.append(example)
 
        return examples
def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training):
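    # Note: `tokenizer`, `whitespace_tokenize`, `np`, `_improve_answer_span`,
    # `_new_check_is_max_context` and `SquadFeatures` are assumed to come from the
    # surrounding module; in the transformers SQuAD processor, `tokenizer` is a
    # module-level global installed by an init function before this runs.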
    features = []
    if is_training and not example.is_impossible:
        # Get start and end position
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1

        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
        )

    spans = []

    truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
    sequence_added_tokens = (
        tokenizer.max_len - tokenizer.max_len_single_sentence + 1
        if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer))
        else tokenizer.max_len - tokenizer.max_len_single_sentence
    )
    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):

        encoded_dict = tokenizer.encode_plus(
            truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
            span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            pad_to_max_length=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
            return_token_type_ids=True,
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
                )
                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :]

        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict:
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (
                j
                if tokenizer.padding_side == "left"
                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
            )
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in an answer)
        # The original TF implementation also keeps the classification token (set to 0)
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens :] = 0
        else:
            p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0

        pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
        ).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the cls index to 0: the CLS index can be used for impossible answers
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False

            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens

                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,  # Cannot set unique_id and example_index here; they are set later, after multiprocessing.
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                qas_id=example.qas_id,
            )
        )
    return features
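
The span loop above is driven by doc_stride: a long token sequence is cut into overlapping windows so every token appears in at least one span, and _new_check_is_max_context later decides which span "owns" each token. Below is a simplified sketch of just the windowing step, not the HuggingFace implementation; names and numbers are illustrative:

def sliding_spans(all_doc_tokens, max_tokens_per_span, doc_stride):
    """Cut a token list into overlapping windows advanced by doc_stride."""
    spans = []
    start = 0
    while start < len(all_doc_tokens):
        spans.append(all_doc_tokens[start:start + max_tokens_per_span])
        if start + max_tokens_per_span >= len(all_doc_tokens):
            break                      # last window already reaches the end
        start += doc_stride
    return spans

# 10 tokens, windows of 6 with stride 4 -> tokens 4 and 5 appear in both spans
assert sliding_spans(list(range(10)), 6, 4) == [[0, 1, 2, 3, 4, 5],
                                                [4, 5, 6, 7, 8, 9]]
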
def read_squad_examples(input_data, tokenizer):
    """
    https://github.com/eva-n27/BERT-for-Chinese-Question-Answering/blob/master/run_squad.py
    Read a SQuAD json file into a list of SquadExample.
    This function turns input_data[i]["paragraphs"]["context"] into a list of words,
    then iterates over "qas" and, for each qa, extracts
    {
        qas_id: qa['id'],
        question_text: qa["question"],
        orig_answer_text: answer["text"],
        start_position: start_position,
        end_position: end_position
    }
    """
    import unicodedata

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    def is_control(char):
        """Checks whether `chars` is a control character."""
        # These are technically control characters but we count them as whitespace
        # characters.
        if char == "\t" or char == "\n" or char == "\r":
            return False
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            return True
        return False

    def clean_text(text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or is_control(char):
                continue
            if is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    examples = []
    tf.logging.info("*** reading squad examples ***")
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = " ".join(
                tokenization.whitespace_tokenize(
                    clean_text(paragraph["context"])))

            for qa in paragraph["qas"]:
                doc_tokens = tokenizer.basic_tokenizer.tokenize(paragraph_text)
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position)
                examples.append(example)
    return examples
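
The reader above first normalizes the context with clean_text and a whitespace re-join before tokenizing. A rough sketch of the effect, using a simplified whitespace split in place of tokenization.whitespace_tokenize (the helper name and sample string are illustrative):

def normalize(text):
    """Drop NUL/replacement characters and collapse whitespace to single spaces."""
    cleaned = []
    for ch in text:
        if ord(ch) == 0 or ord(ch) == 0xfffd:
            continue                     # invalid characters are removed
        cleaned.append(" " if ch in " \t\r\n" else ch)
    return " ".join("".join(cleaned).split())

assert normalize("hello\tworld\r\n  again\x00") == "hello world again"
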
Example #22
0
def read_squad_examples(input_file, tokenizer, is_training):
    """
    Read a SQuAD json file into a list of SquadExample.
    This function turns input_data[i]["paragraphs"]["context"] into a list of words,
    then iterates over "qas" and, for each qa, extracts
    {
        qas_id: qa['id'],
        question_text: qa["question"],
        orig_answer_text: answer["text"],
        start_position: start_position,
        end_position: end_position
    }
    """
    import unicodedata
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    def is_control(char):
        """Checks whether `chars` is a control character."""
        # These are technically control characters but we count them as whitespace
        # characters.
        if char == "\t" or char == "\n" or char == "\r":
            return False
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            return True
        return False

    def clean_text(text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or is_control(char):
                continue
            if is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    examples = []
    for entry in tqdm(input_data):
        for paragraph in entry["paragraphs"]:
            paragraph_text = " ".join(
                tokenization.whitespace_tokenize(
                    clean_text(paragraph["context"])))

            for qa in paragraph["qas"]:
                doc_tokens = tokenizer.basic_tokenizer.tokenize(paragraph_text)
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    if len(qa["answers"]) != 1:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    if len(orig_answer_text) == 0:
                        continue
                    cleaned_answer_text = "".join(
                        tokenizer.basic_tokenizer.tokenize(orig_answer_text))
                    ori_start_position = "".join(doc_tokens).find(
                        cleaned_answer_text)
                    if ori_start_position == -1:
                        print("Could not find answer: '%s' vs. '%s'",
                              ''.join(doc_tokens), cleaned_answer_text)
                        continue
                    ori_end_position = ori_start_position + len(
                        cleaned_answer_text) - 1
                    char_to_word_offset = {}
                    start = 0
                    for idx, token in enumerate(doc_tokens):
                        for _ in token:
                            char_to_word_offset[start] = idx
                            start += 1
                    start_position = char_to_word_offset[ori_start_position]
                    try:
                        end_position = char_to_word_offset[ori_end_position]
                    except KeyError:
                        continue
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    actual_text = "".join(
                        doc_tokens[start_position:(end_position + 1)])
                    if actual_text != cleaned_answer_text:
                        # print("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                        continue
                    orig_answer_text = cleaned_answer_text

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position)
                examples.append(example)
    return examples
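
Because Chinese text has no whitespace, the reader above builds char_to_word_offset as a dict over the characters of the basic-tokenizer output rather than scanning the raw string. A small illustration; the token list here stands in for tokenizer.basic_tokenizer.tokenize(paragraph_text):

doc_tokens = ["今天", "天气", "好"]       # pretend basic-tokenizer output
char_to_word_offset = {}
start = 0
for idx, token in enumerate(doc_tokens):
    for _ in token:
        char_to_word_offset[start] = idx   # each character points at its token index
        start += 1

assert char_to_word_offset == {0: 0, 1: 0, 2: 1, 3: 1, 4: 2}
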
Example #23
0
def read_squad_examples(input_file,
                        is_training,
                        context_only=False,
                        question_only=False,
                        draft=False,
                        draft_num_examples=12,
                        tokenizer=None):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for doc_idx, entry in enumerate(input_data):
        title = entry['title']
        for pid, paragraph in enumerate(entry["paragraphs"]):
            if not question_only:
                paragraph_text = paragraph["context"]
                doc_tokens, char_to_word_offset = context_to_tokens_and_offset(
                    paragraph_text, tokenizer=tokenizer)
            if context_only:
                example = SquadExample(doc_tokens=doc_tokens,
                                       title=title,
                                       doc_idx=doc_idx,
                                       pid=pid)
                examples.append(example)
                if draft and len(examples) == draft_num_examples:
                    return examples
                continue
            else:
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None
                    if is_training:
                        if False:  # len(qa["answers"]) > 1:
                            raise ValueError(
                                "For training, each question should have exactly 1 answer."
                            )
                        elif len(qa["answers"]) == 0:
                            orig_answer_text = ""
                            start_position = -1
                            end_position = -1
                        else:
                            answer = qa["answers"][0]
                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[answer_offset +
                                                               answer_length -
                                                               1]
                            # Only add answers where the text can be exactly recovered from the
                            # document. If this CAN'T happen it's likely due to weird Unicode
                            # stuff so we will just skip the example.
                            #
                            # Note that this means for training mode, every example is NOT
                            # guaranteed to be preserved.
                            actual_text = " ".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(
                                    orig_answer_text))
                            if actual_text.find(cleaned_answer_text) == -1:
                                logger.warning(
                                    "Could not find answer: '%s' vs. '%s'",
                                    actual_text, cleaned_answer_text)
                                continue

                    if question_only:
                        example = SquadExample(qas_id=qas_id,
                                               question_text=question_text)
                    else:
                        example = SquadExample(
                            qas_id=qas_id,
                            question_text=question_text,
                            doc_tokens=doc_tokens,
                            orig_answer_text=orig_answer_text,
                            start_position=start_position,
                            end_position=end_position,
                            title=title,
                            pid=pid)
                    examples.append(example)

                    if draft and len(examples) == draft_num_examples:
                        return examples
    return examples
Example #24
0
def read_doqa_examples(input_file, is_training):
    """Read a DoQA json file into a list of DOQAExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    if FLAGS.load_small_portion:
        input_data = input_data[:10]
        # print('input_data:', input_data)
        tf.logging.warning('<<<<<<<<<< load_small_portion is on! >>>>>>>>>>')
    for entry in input_data:
        # An additional "CANNOTANSWER" has been added in DoQA data, so no need to append one.
        entry = entry['paragraphs'][0]
        paragraph_text = entry["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        ############################################################
        # convert the convasational QAs to squad format, with history
        ############################################################

        questions = [(item['question'], item['id'])
                     for item in entry['qas']]  # [(question, question_id), ()]
        answers = [(item['orig_answer']['text'],
                    item['orig_answer']['answer_start'])
                   for item in entry['qas']]
        followups = [item['followup'] for item in entry['qas']]
        yesnos = [item['yesno'] for item in entry['qas']]

        qas = []
        for i, (question, answer, followup,
                yesno) in enumerate(zip(questions, answers, followups,
                                        yesnos)):
            metadata = {
                'turn': i + 1,
                'history_turns': [],
                'tok_history_answer_markers': [],
                'followup': followup,
                'yesno': yesno,
                'history_turns_text': []
            }
            # if FLAGS.use_RL:
            #     start_index = 0
            # else:
            #     start_index = 0 if i - int(FLAGS.history) < 0 else i - int(FLAGS.history)

            end_index = i
            question_with_histories = ''

            history_answer_marker = None

            start_index = 0  # we read all the histories whether or not we use RL; appropriate selections are made afterwards
            history_answer_marker = []
            for history_turn, (each_answer, each_question) in enumerate(
                    zip(answers[start_index:end_index],
                        questions[start_index:end_index])):

                # [history_answer_start, history_answer_end, history_answer_text]
                each_marker = [
                    each_answer[1], each_answer[1] + len(each_answer[0]),
                    each_answer[0]
                ]
                history_answer_marker.append(each_marker)
                metadata['history_turns'].append(history_turn + start_index +
                                                 1)
                metadata['history_turns_text'].append(
                    (each_question[0],
                     each_answer[0]))  #[(q1, a1), (q2, a2), ...]

            # add the current question
            question_with_histories += question[0]
            qas.append({
                'id':
                question[1],
                'question':
                question_with_histories,
                'answers': [{
                    'answer_start': answer[1],
                    'text': answer[0]
                }],
                'history_answer_marker':
                history_answer_marker,
                'metadata':
                metadata
            })

        for qa in qas:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None

            # if is_training:
            # we read in the ground-truth answer both during training and prediction, because we need to compute acc and F1 at prediction time.
            if len(qa["answers"]) != 1:
                raise ValueError(
                    "For training, each question should have exactly 1 answer."
                )
            answer = qa["answers"][0]
            orig_answer_text = answer["text"]
            answer_offset = answer["answer_start"]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length -
                                               1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position +
                                                              1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(orig_answer_text))

            if is_training and actual_text.find(cleaned_answer_text) == -1:
                tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                continue

            # we construct a tok_history_answer_marker to store the aggregated history answer markers for a question.
            # we also construct each_tok_history_answer_marker to store a single history answer marker.
            tok_history_answer_marker = [0] * len(doc_tokens)

            for marker_index, marker in enumerate(qa['history_answer_marker']):
                each_tok_history_answer_marker = [0] * len(doc_tokens)
                history_orig_answer_text = marker[2]
                history_answer_offset = marker[0]
                history_answer_length = len(history_orig_answer_text)
                history_start_position = char_to_word_offset[
                    history_answer_offset]
                history_end_position = char_to_word_offset[
                    history_answer_offset + history_answer_length - 1]
                history_actual_text = " ".join(
                    doc_tokens[history_start_position:(history_end_position +
                                                       1)])
                history_cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(history_orig_answer_text))
                if history_actual_text.find(history_cleaned_answer_text) != -1:
                    tok_history_answer_marker = tok_history_answer_marker[: history_start_position] + \
                                        [1] * (history_end_position - history_start_position + 1) + \
                                        tok_history_answer_marker[history_end_position + 1 :]
                    each_tok_history_answer_marker = each_tok_history_answer_marker[: history_start_position] + \
                                        [1] * (history_end_position - history_start_position + 1) + \
                                        each_tok_history_answer_marker[history_end_position + 1 :]
                    assert len(tok_history_answer_marker) == len(doc_tokens)
                    assert len(each_tok_history_answer_marker) == len(
                        doc_tokens)
                    qa['metadata']['tok_history_answer_markers'].append(
                        each_tok_history_answer_marker)
                else:
                    tf.logging.warning(
                        "Could not find history answer: '%s' vs. '%s'",
                        history_actual_text, history_cleaned_answer_text)

            example = DOQAExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                history_answer_marker=tok_history_answer_marker,
                metadata=qa['metadata'])
            examples.append(example)
            # print(example)
    return examples
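
tok_history_answer_marker above is just a 0/1 vector over doc_tokens with ones on the positions covered by a previous turn's answer. A toy illustration of the slicing used to set it; doc_tokens and the span are made up for the example:

doc_tokens = ["the", "cat", "sat", "on", "the", "mat"]
history_start_position, history_end_position = 1, 2   # the history answer "cat sat"

tok_history_answer_marker = [0] * len(doc_tokens)
tok_history_answer_marker = (
    tok_history_answer_marker[:history_start_position]
    + [1] * (history_end_position - history_start_position + 1)
    + tok_history_answer_marker[history_end_position + 1:]
)
assert tok_history_answer_marker == [0, 1, 1, 0, 0, 0]
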
Example #25
0
def read_docvqa_examples(input_file, is_training, skip_match_answers=True):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    count_match = 0
    count_nomatch = 0

    examples = []
    for paragraph in input_data:
        image_id = paragraph["image_id"]
        paragraph_text = paragraph["context"]
        boxes = paragraph["boxes"]
        doc_tokens = paragraph["context"]
        for qa in paragraph["qas"]:
            if not qa["answer"]:
                continue
            qas_id = qa["qid"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            answer = qa["answer"][0]
            orig_answer_text = answer["text"]
            if is_training:
                if not is_impossible:
                    answer = qa["answer"][0]
                    orig_answer_text = answer["text"]
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    start_position = qa["answer"][0]["answer_start"]
                    end_position = qa["answer"][0]["answer_end"]
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    if not skip_match_answers:
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            count_nomatch += 1
                            continue
                    count_match += 1
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

            example = DocvqaExample(qas_id=qas_id,
                                    question_text=question_text,
                                    doc_tokens=doc_tokens,
                                    orig_answer_text=orig_answer_text,
                                    start_position=start_position,
                                    end_position=end_position,
                                    is_impossible=is_impossible,
                                    boxes=boxes)
            examples.append(example)
    return examples
Example #26
0
def read_kg_examples(input_file, is_training):
  """Read a knowledge graph json file into a list of KGCExample."""
  with tf.gfile.Open(input_file, "r") as reader:
    input_data = json.load(reader)["data"]#[Memo]ちょっとわからないけどデータを抽出してる

  def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
      return True
    return False

  examples = []
  for entry in input_data:
    for data in entry["data"]:

      # convert the string into a sequence of words
      nlr_text = data["nlr"]
      kgr = []
      char_to_word_offset = []
      prev_is_whitespace = True
      for c in nlr_text:
        if is_whitespace(c):
          prev_is_whitespace = True
        else:
          if prev_is_whitespace:
            kgr.append(c)
          else:
            kgr[-1] += c
          prev_is_whitespace = False
        char_to_word_offset.append(len(kgr) - 1)

      # convert the input data into KGCExample instances
      for qa in data["qas"]:
        kg_id = qa["id"]
        nlr = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        if is_training:

          if FLAGS.version_2_with_negative:  # the data is SQuAD 2.0-style (may contain unanswerable questions)
            is_impossible = qa["is_impossible"]  # a question is "impossible" when it has no answer in the context
          if (len(qa["answers"]) != 1) and (not is_impossible):
            raise ValueError(
                "For training, each question should have exactly 1 answer.")
          if not is_impossible:
            answer = qa["answers"][0]
            orig_answer_text = answer["text"]
            answer_offset = answer["answer_start"]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length -
                                               1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(
                kgr[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
              tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                 actual_text, cleaned_answer_text)
              continue
          else:
            start_position = -1
            end_position = -1
            orig_answer_text = ""

        # convert the data gathered so far into a KGCExample instance
        example = KGCExample(
            kg_id=kg_id,
            nlr=nlr,
            kgr=kgr,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible)
        examples.append(example)

  return examples