def read(self, input_file, sentence_id_file: str = None) -> List[QAFullExample]:
    """Load QA examples from *input_file*; evidence sentence ids may be
    overridden by the mapping in *sentence_id_file* (keyed by instance id)."""
    logger.info(f'Reading data from {input_file}')
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    if sentence_id_file is not None:
        with open(sentence_id_file, 'r') as f:
            sentence_ids = json.load(f)
    else:
        sentence_ids = None

    examples = []
    for instance in data:
        article = instance['article']
        data_id = instance['id']
        question = instance['question']
        answer = instance['answer']

        # Whitespace-tokenize the article while recording, for every
        # character, the index of the token it belongs to.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in article:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Character-level sentence boundaries mapped to token spans.
        starts, ends = utils.split_sentence(article, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        # NOTE(review): when no sentence-id file is supplied, the raw instance
        # id is stored as the sentence id — confirm downstream consumers
        # expect this placeholder.
        sentence_id = instance['id'] if sentence_ids is None else sentence_ids[data_id]

        examples.append(QAFullExample(
            qas_id=data_id,
            question_text=question,
            doc_tokens=doc_tokens,
            sentence_span_list=sentence_span_list,
            sentence_id=sentence_id,
            is_impossible=answer
        ))
    return examples
def read(self, input_file):
    """Read a SQuAD json file into a list of SquadExample."""
    logger.info('Reading data set from {}...'.format(input_file))
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    letter_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

    examples = []
    for instance in tqdm(input_data):
        passage = instance['article']
        article_id = instance['id']

        # Whitespace tokenization plus a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in passage:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences and express them as token spans.
        starts, ends = utils.split_sentence(passage, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        questions = instance['questions']
        answers = [letter_to_index[x] for x in instance['answers']]
        options = instance['options']

        # One example per question attached to this article.
        for q_id, (question, answer, option_list) in enumerate(zip(questions, answers, options)):
            examples.append(MultiChoiceFullExample(
                qas_id=f"{article_id}--{q_id}",
                question_text=question,
                options=option_list,
                doc_tokens=doc_tokens,
                sentence_span_list=sentence_span_list,
                answer=answer
            ))

    logger.info('Finish reading {} examples from {}'.format(len(examples), input_file))
    return examples
def read(self, input_file):
    """
    :param input_file: input file to load data. The format is in CoQA style
    """
    logger.info('Reading data set from {}...'.format(input_file))
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]

            # Whitespace tokenization with a char -> token index map.
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for ch in paragraph_text:
                if is_whitespace(ch):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(ch)
                    else:
                        doc_tokens[-1] += ch
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            # Split context into sentences, expressed as token spans.
            starts, ends = utils.split_sentence(paragraph_text, self.sentence_tokenizer)
            sentence_span_list = [
                (char_to_word_offset[s], char_to_word_offset[e])
                for s, e in zip(starts, ends)
            ]

            for qa in paragraph["qas"]:
                answer = qa["answers"][0]
                answer_offset = answer["answer_start"]
                answer_length = len(answer["text"])

                # Gold answer span in token coordinates.
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]

                # The sentence containing the answer span is the evidence label.
                sentence_id = utils.find_evidence_sentence(
                    sentence_span_list, start_position, end_position)

                examples.append(SQuADFullExample(
                    qas_id=qa["id"],
                    question_text=qa["question"],
                    doc_tokens=doc_tokens,
                    sentence_span_list=sentence_span_list,
                    orig_answer_text="",
                    start_position=None,
                    end_position=None,
                    sentence_id=sentence_id,
                    is_impossible=-1,
                    ral_start_position=start_position,
                    ral_end_position=end_position))
    return examples
def read(self, input_file, dialog_turns: int = 2) -> List[QAFullExample]:
    """
    :param input_file: input file to load data. The format is in CoQA style
    :param dialog_turns: how many previous dialog turns are prepended to the
        current question as context.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    logger.info('Read parameters:')
    logger.info('Dialog turns: {}'.format(dialog_turns))

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["story"]
        story_id = paragraph['id']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in paragraph_text:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        starts, ends = utils.split_sentence(paragraph_text, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        # Per-sentence token lists. NOTE(review): the examples below store
        # these nested lists as doc_tokens — presumably downstream expects
        # sentence-grouped tokens here; verify against the consumer.
        doc_sentence_tokens = [doc_tokens[s:e + 1] for s, e in sentence_span_list]

        questions = paragraph['questions']
        answers = paragraph['answers']
        for i, (question, answer) in enumerate(zip(questions, answers)):
            question_text = question['input_text']

            # We are only concerned about questions with Yes/No as answers.
            answer_type = utils.normalize_answer(answer['input_text'])
            if answer_type not in ['yes', 'no']:
                continue
            answer_choice = 0 if answer_type == 'yes' else 1

            # Prepend up to `dialog_turns` previous QA pairs as dialog context.
            for j in range(dialog_turns):
                pre_idx = i - (j + 1)
                if pre_idx >= 0:
                    question_text = questions[pre_idx]['input_text'] + '<Q>' + \
                        answers[pre_idx]['input_text'] + '<A>' + question_text

            qas_id = story_id + '--' + str(i + 1)

            # Add rationale start and end as extra supervised label.
            rationale_start_position = char_to_word_offset[answer['span_start']]
            rationale_end_position = char_to_word_offset[answer['span_end'] - 1]
            sentence_id = utils.find_evidence_sentence(
                sentence_span_list, rationale_start_position, rationale_end_position)

            examples.append(QAFullExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_sentence_tokens,
                sentence_span_list=sentence_span_list,
                orig_answer_text="",
                start_position=None,
                end_position=None,
                sentence_id=sentence_id,
                is_impossible=answer_choice,
                ral_start_position=rationale_start_position,
                ral_end_position=rationale_end_position))
    return examples
def read(self, input_file):
    """Read a SQuAD json file into a list of SquadExample.

    Each item pairs a multi-document passage with a yes/no question; the
    yes/no answer is encoded into ``is_impossible`` (yes -> 0, no -> 1).

    :param input_file: json file with parallel lists under the keys
        'passages', 'ids', 'questions' and 'yes_no'.
    :raises RuntimeError: if a 'yes_no' value is neither 'yes' nor 'no'.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        if ch == " " or ch == "\t" or ch == "\r" or ch == "\n" or ord(ch) == 0x202F:
            return True
        return False

    examples = []
    for articles, qas_id, question, yes_no in tqdm(
            zip(input_data['passages'], input_data['ids'],
                input_data['questions'], input_data['yes_no'])):
        # Read all passages. Fix: join once instead of repeated string
        # concatenation, which is quadratic in the number of documents.
        passage = ''.join(doc['text'] for doc in articles)

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        prev_is_whitespace = True
        char_to_word_offset = []
        for c in passage:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        sentence_start_list, sentence_end_list = utils.split_sentence(passage, self.sentence_tokenizer)
        sentence_span_list = []
        for c_start, c_end in zip(sentence_start_list, sentence_end_list):
            t_start = char_to_word_offset[c_start]
            t_end = char_to_word_offset[c_end]
            sentence_span_list.append((t_start, t_end))

        # Encode the yes/no answer as a choice index.
        if yes_no == 'yes':
            is_impossible = 0
        elif yes_no == 'no':
            is_impossible = 1
        else:
            raise RuntimeError(f'Wrong yes_no type : {yes_no}')

        example = SQuADFullExample(
            qas_id=qas_id,
            question_text=question,
            doc_tokens=doc_tokens,
            sentence_span_list=sentence_span_list,
            orig_answer_text="",
            start_position=None,
            end_position=None,
            sentence_id=None,
            is_impossible=is_impossible,
            ral_start_position=None,
            ral_end_position=None)
        examples.append(example)

    logger.info('Finish reading {} examples from {}'.format(len(examples), input_file))
    return examples
def read(self, input_file):
    """Load claim-verification examples: each instance pairs a claim with a
    passage, an evidence sentence id, and a yes/no label."""
    logger.info(f'Reading data set from {input_file}...')
    with open(input_file, 'r') as f:
        data = json.load(f)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    examples = []
    for instance_id in tqdm(data, desc=f'Reading examples from {input_file}...', total=len(data)):
        instance = data[instance_id]
        claim = instance['claim']
        sentence_id = instance['evidence']
        label = instance['label'].lower()
        passage = instance['passage']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in passage:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        starts, ends = utils.split_sentence(passage, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        # yes -> 0, no -> 1; anything else is a data error.
        if label == 'yes':
            answer_choice = 0
        elif label == 'no':
            answer_choice = 1
        else:
            raise RuntimeError(f'Wrong label for {label}')

        examples.append(QAFullExample(
            qas_id=instance_id,
            question_text=claim,
            doc_tokens=doc_tokens,
            sentence_span_list=sentence_span_list,
            orig_answer_text="",
            start_position=None,
            end_position=None,
            sentence_id=sentence_id,
            is_impossible=answer_choice,
            ral_start_position=None,
            ral_end_position=None
        ))
    return examples
def read(self, input_file, dialog_turns: int = 2, remove_evidence=False, remove_question=False,
         remove_passage=False, remove_dict=None):
    """
    :param input_file: input file to load data. The format is in CoQA style
    :param dialog_turns: Decide how many turns' questions and answers will be
        appended before current question.
    :param remove_evidence: if True, drop the gold evidence sentence from the
        passage (applied on non-training files only).
    :param remove_question: unused here; kept for interface compatibility.
    :param remove_passage: unused here; kept for interface compatibility.
    :param remove_dict: optional path to a json mapping story_id -> list of
        question ids (as strings) that should be skipped.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    logger.info('Read parameters:')
    logger.info('Dialog turns: {}'.format(dialog_turns))
    logger.info('Remove evidence during test: {}'.format(remove_evidence))
    logger.info('Remove dict: {}'.format(remove_dict))

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        if ch == " " or ch == "\t" or ch == "\r" or ch == "\n" or ord(ch) == 0x202F:
            return True
        return False

    if remove_dict is None:
        remove_dict = {}
    else:
        # Fix: use a context manager so the file handle is always closed
        # (the previous json.load(open(...)) leaked the handle).
        with open(remove_dict, 'r') as remove_f:
            remove_dict = json.load(remove_f)
        logger.info(len(remove_dict))

    # Tracks agreement between rule-generated sentence ids and rationale-derived ones.
    rule_labels_acc = utils.AverageMeter()

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["story"]
        story_id = paragraph['id']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        prev_is_whitespace = True
        char_to_word_offset = []
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        sentence_start_list, sentence_end_list = utils.split_sentence(
            paragraph_text, self.sentence_tokenizer)
        sentence_span_list = []
        for c_start, c_end in zip(sentence_start_list, sentence_end_list):
            t_start = char_to_word_offset[c_start]
            t_end = char_to_word_offset[c_end]
            sentence_span_list.append((t_start, t_end))

        questions = paragraph['questions']
        answers = paragraph['answers']
        for i, (question, answer) in enumerate(zip(questions, answers)):
            question_text = question['input_text']

            # We are only concerned about questions with Yes/No as answers.
            answer_type = utils.normalize_answer(answer['input_text'])
            if answer_type not in ['yes', 'no']:
                continue

            # Skip questions explicitly blacklisted for this story.
            if story_id in remove_dict and str(i + 1) in remove_dict[story_id]:
                continue

            if answer_type == 'yes':
                answer_choice = 0
            else:
                answer_choice = 1

            # Prepend up to `dialog_turns` previous QA pairs as dialog context.
            for j in range(dialog_turns):
                pre_idx = i - (j + 1)
                if pre_idx >= 0:
                    question_text = questions[pre_idx][
                        'input_text'] + '<Q>' + answers[pre_idx][
                        'input_text'] + '<A>' + question_text

            qas_id = story_id + '--' + str(i + 1)

            # Add rationale start and end as extra supervised label.
            rationale_start_position = char_to_word_offset[answer['span_start']]
            rationale_end_position = char_to_word_offset[answer['span_end'] - 1]

            sentence_id = utils.find_evidence_sentence(
                sentence_span_list, rationale_start_position, rationale_end_position)

            # Remove evidence sentence for experiments while evaluation only.
            if remove_evidence and sentence_id != -1 and 'train' not in input_file:
                evi_token_s, evi_token_e = sentence_span_list[sentence_id]
                new_doc_tokens = doc_tokens[:evi_token_s] + doc_tokens[(evi_token_e + 1):]
                rationale_start_position = rationale_end_position = -1
                # Shift the spans that followed the removed sentence.
                reduce_offset = evi_token_e - evi_token_s + 1
                new_sentence_span_list = sentence_span_list[:sentence_id] + [
                    (s - reduce_offset, e - reduce_offset)
                    for s, e in sentence_span_list[(sentence_id + 1):]
                ]
                sentence_id = -1
            else:
                new_doc_tokens = doc_tokens
                new_sentence_span_list = sentence_span_list

            # Prefer rule-generated pseudo labels when available, tracking
            # how often they agree with the rationale-derived sentence id.
            if 'sentence_id' in question:
                pseudo_sentence_id = question['sentence_id']
                if pseudo_sentence_id == sentence_id:
                    rule_labels_acc.update(1)
                else:
                    rule_labels_acc.update(0)
                sentence_id = pseudo_sentence_id

            example = SQuADFullExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=new_doc_tokens,
                sentence_span_list=new_sentence_span_list,
                orig_answer_text="",
                start_position=None,
                end_position=None,
                sentence_id=sentence_id,
                is_impossible=answer_choice,
                ral_start_position=rationale_start_position,
                ral_end_position=rationale_end_position)
            examples.append(example)

    if rule_labels_acc.count > 0:
        logger.info('Read labels generated by rules.')
        logger.info(f'Accuracy of labels: {rule_labels_acc.avg}')

    return examples
def read(self, input_file, read_state, sample_ratio: float = 0.5,
         dialog_turns: int = 2, extra_sen_file: str = None) -> List[QAFullExample]:
    """
    :param input_file: input file to load data. The format is in CoQA style
    :param read_state: If read extra sentences from CoQA dataset.
    :param sample_ratio: the ratio of negative sampling.
    :param dialog_turns: Decide how many turns' questions and answers will be
        appended before current question.
    :param extra_sen_file: If read_extra_self is False, then this parameter
        must be specified as the way path for extra sentence file.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    logger.info('Read parameters:')
    logger.info('Dialog turns: {}'.format(dialog_turns))
    logger.info('Read state: {}'.format(read_state))
    logger.info('Sample ratio: {}'.format(sample_ratio))
    logger.info('Extra sentence file: {}'.format(extra_sen_file))
    assert read_state in ReadState

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    # Pool of candidate sentences used as negative samples.
    all_sentences = []
    if read_state == ReadState.SampleFromSelf:
        for paragraph in input_data:
            for sentence in self.sentence_tokenizer.tokenize(paragraph['story']):
                sentence_tokens = whitespace_tokenize(sentence)
                if sentence_tokens:
                    all_sentences.append(sentence_tokens)
                else:
                    logger.warning('Empty sentence!')
    elif read_state == ReadState.SampleFromExternal:
        # External sampling is not implemented here.
        pass
    logger.info('Read extra sentences: {}'.format(len(all_sentences)))

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["story"]
        story_id = paragraph['id']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in paragraph_text:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        starts, ends = utils.split_sentence(paragraph_text, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        # Per-sentence token lists, fed to the negative-sampling helper.
        doc_sentence_tokens = [doc_tokens[span[0]: (span[1] + 1)] for span in sentence_span_list]

        questions = paragraph['questions']
        answers = paragraph['answers']
        for i, (question, answer) in enumerate(zip(questions, answers)):
            question_text = question['input_text']

            # We are only concerned about questions with Yes/No as answers.
            answer_type = utils.normalize_answer(answer['input_text'])
            if answer_type not in ['yes', 'no']:
                continue
            answer_choice = 0 if answer_type == 'yes' else 1

            # Prepend up to `dialog_turns` previous QA pairs as dialog context.
            for j in range(dialog_turns):
                pre_idx = i - (j + 1)
                if pre_idx >= 0:
                    question_text = questions[pre_idx]['input_text'] + '<Q>' + answers[pre_idx][
                        'input_text'] + '<A>' + question_text

            qas_id = story_id + '--' + str(i + 1)

            # Add rationale start and end as extra supervised label.
            rationale_start_position = char_to_word_offset[answer['span_start']]
            rationale_end_position = char_to_word_offset[answer['span_end'] - 1]

            sentence_id = utils.find_evidence_sentence(
                sentence_span_list, rationale_start_position, rationale_end_position)

            # Add negative samples. NOTE(review): the helper rebinds
            # sentence_span_list, so later questions in this paragraph see the
            # updated spans — kept exactly as in the original.
            if read_state != ReadState.NoNegative:
                new_doc_tokens, sentence_label, new_sentence_id, sentence_span_list, orig_token_map = \
                    utils.generate_seq_with_negative_sample(
                        doc_sentence_tokens, all_sentences, sample_ratio, target_index=sentence_id)
                rationale_start_position = orig_token_map[rationale_start_position]
                rationale_end_position = orig_token_map[rationale_end_position]
            else:
                new_doc_tokens = doc_tokens
                sentence_label = [0] * len(sentence_span_list)
                new_sentence_id = sentence_id

            examples.append(QAFullExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=new_doc_tokens,
                sentence_span_list=sentence_span_list,
                orig_answer_text="",
                start_position=None,
                end_position=None,
                sentence_id=new_sentence_id,
                is_impossible=answer_choice,
                ral_start_position=rationale_start_position,
                ral_end_position=rationale_end_position,
                meta_data={'sentence_label': sentence_label}))
    return examples
def read(self, input_file):
    """
    :param input_file: input file to load data. The format is in BoolQ style
        (one json object per line).
    """
    logger.info('Reading data set from {}...'.format(input_file))

    # BoolQ is jsonlines; assign sequential string ids while loading.
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = []
        for line in reader:
            item = json.loads(line)
            item['id'] = str(len(input_data))
            input_data.append(item)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["passage"]
        story_id = paragraph['id']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in paragraph_text:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        starts, ends = utils.split_sentence(paragraph_text, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        question_text = paragraph['question']
        answer_text = ('yes' if paragraph['answer'] else 'no')

        # We are only concerned about questions with Yes/No as answers.
        answer_type = answer_text
        if answer_type not in ['yes', 'no']:
            continue
        answer_choice = 0 if answer_type == 'yes' else 1

        # Evidence sentence id is optional in BoolQ-style data.
        sentence_id = paragraph['sentence_id'] if 'sentence_id' in paragraph else -1

        examples.append(BoolQFullExample(
            qas_id=story_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            sentence_span_list=sentence_span_list,
            orig_answer_text="",
            start_position=None,
            end_position=None,
            sentence_id=sentence_id,
            is_impossible=answer_choice))
    return examples