def tokenize(self, text): """Tokenizes a piece of text.""" text = self._clean_text(text) # This was added on November 1st, 2018 for the multilingual and Chinese # models. This is also applied to the English models now, but it doesn't # matter since the English models were not trained on any Chinese data # and generally don't have any Chinese data in them (there are Chinese # characters in the vocabulary because Wikipedia does have some Chinese # words in the English Wikipedia.). text = self._tokenize_chinese_chars(text) orig_tokens = btok.whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: # pass MASK forward if MASK in token: split_tokens.append(MASK) if token != MASK: remaining_chars = token.replace(MASK, "").strip() if remaining_chars: split_tokens.append(remaining_chars) continue if self.do_lower_case: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token)) output_tokens = btok.whitespace_tokenize(" ".join(split_tokens)) return output_tokens
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)
    return examples
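The `char_to_word_offset` bookkeeping above is the core trick shared by every SQuAD reader on this page: one table entry per character, mapping each character position to the index of the word containing it, so the dataset's character-level `answer_start` can be converted to word-level positions. A toy walkthrough (independent of any SQuAD file):

context = "Saint Bernadette Soubirous"
# Running the loop above on `context` yields:
#   doc_tokens          == ['Saint', 'Bernadette', 'Soubirous']
#   char_to_word_offset == [0]*6 + [1]*11 + [2]*9   (one entry per char;
#                          whitespace maps to the preceding word)
# so for answer "Bernadette" with answer_start == 6 and length 10:
#   start_position = char_to_word_offset[6]          -> 1
#   end_position   = char_to_word_offset[6 + 10 - 1] -> 1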
def tokenize(self, text):
    orig_tokens = tokenization.whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
        for sub_token in self.wordpiece_tokenizer.tokenize(token):
            split_tokens.append(sub_token)
    return split_tokens
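Every snippet on this page delegates the initial split to `whitespace_tokenize`. In Google's original BERT `tokenization.py` it is just strip-and-split, reproduced here for reference:

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens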
def read_many_examples(input_file, is_training):
    '''who was the american in space ? in space'''
    lines_list = span_utils.read_cols_lines(input_file=input_file)
    examples = []
    for i in range(len(lines_list)):
        paragraph_text = lines_list[i][0]
        answer_text = lines_list[i][1]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if span_utils.is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        if is_training:
            qas_id = 'train_' + str(i)
        else:
            qas_id = 'test_' + str(i)

        # question_text = 'abc'  # no use
        start_position = None
        end_position = None
        orig_answer_text = None
        if is_training:
            # if len(answer_text) != 1:
            #     raise ValueError("For training, each question should have exactly 1 answer.")
            orig_answer_text = answer_text
            answer_offset = paragraph_text.find(answer_text)  # answer_start
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            print(paragraph_text, '\t', answer_text)
            end_position = char_to_word_offset[answer_offset + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                continue
        else:
            orig_answer_text = answer_text

        example = SequenceExample(
            qas_id=qas_id,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples
def get_start_end_and_tokens(paragraph_text, question_text, orig_answer_text,
                             answer_offset, tokenizer, improve_flag=False):
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    answer_length = len(orig_answer_text)
    start_position = char_to_word_offset[answer_offset]
    end_position = char_to_word_offset[answer_offset + answer_length - 1]
    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
    cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    tok_start_position = orig_to_tok_index[start_position]
    if end_position < len(doc_tokens) - 1:
        tok_end_position = orig_to_tok_index[end_position + 1] - 1
    else:
        tok_end_position = len(all_doc_tokens) - 1
    if improve_flag:
        tok_start_position, tok_end_position = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            orig_answer_text)

    # Adjustment: shift both positions past "[CLS] question [SEP]" at the
    # front of the packed input; the +2 accounts for [CLS] and [SEP].
    tokenized_question = tokenizer.tokenize(question_text)
    tokenized_para = tokenizer.tokenize(paragraph_text)
    tok_start_position = tok_start_position + 2 + len(tokenized_question)
    tok_end_position = tok_end_position + 2 + len(tokenized_question)
    return tokenized_question, tokenized_para, tok_start_position, tok_end_position
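The `_improve_answer_span` helper called under `improve_flag` (and again in the script further down) matches the function of the same name in the original BERT `run_squad.py`: it nudges the WordPiece span inward so it lines up exactly with the tokenized answer. Sketch of that canonical implementation:

def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns a token span that better matches the annotated answer."""
    # The word-aligned span can carry extra surrounding WordPieces; scan every
    # sub-span and keep the one whose text equals the tokenized answer, e.g.
    # recovering "1895" out of the word "1895." once both are tokenized.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)
    return (input_start, input_end)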
def tokenize(self, text): """Tokenizes a piece of text.""" text = self._clean_text(text) # This was added on November 1st, 2018 for the multilingual and Chinese # models. This is also applied to the English models now, but it doesn't # matter since the English models were not trained on any Chinese data # and generally don't have any Chinese data in them (there are Chinese # characters in the vocabulary because Wikipedia does have some Chinese # words in the English Wikipedia.). text = self._tokenize_chinese_chars(text) orig_tokens = whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: if self.do_lower_case and token not in self.never_split: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token)) output_tokens = whitespace_tokenize(" ".join(split_tokens)) return output_tokens
def icd9_tokenizer_style(vocab, text_a):
    # We follow the same tokenization approach as the original paper.
    # @vocab is an object that converts words to indices in exactly the order
    # of the pretrained word vectors; use
    #   Vocab = load_vocab('/local/datdb/MIMIC3database/format10Jan2019/vocab+icd_index_map.txt')
    # @text_a can be split by space (in the case of preprocessed ICD-9 notes).
    tokens_a = whitespace_tokenize(text_a)
    input_ids = []
    for token in tokens_a:
        if token in vocab:
            input_ids.append(vocab[token])
        else:
            input_ids.append(1)  # unknown token id
    return input_ids, len(input_ids)
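A quick usage sketch with a made-up toy vocab dict (the real `vocab` comes from `load_vocab` as noted in the comment; mapping unknowns to id 1 is this snippet's own convention):

toy_vocab = {'[PAD]': 0, '[UNK]': 1, 'acute': 2, 'renal': 3, 'failure': 4}
ids, n = icd9_tokenizer_style(toy_vocab, "acute renal failure of unknownword")
# 'of' and 'unknownword' are out of vocabulary, so both map to 1.
assert ids == [2, 3, 4, 1, 1] and n == 5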
def tokenize(self, text): """Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. For example: input = "unaffable" output = ["un", "##aff", "##able"] Args: text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer`. Returns: A list of wordpiece tokens. """ output_tokens = [] for token in whitespace_tokenize(text): chars = list(token) if len(chars) > self.max_input_chars_per_word: output_tokens.append(self.unk_token) continue is_bad = False start = 0 sub_tokens = [] while start < len(chars): end = len(chars) cur_substr = None while start < end: substr = "".join(chars[start:end]) if start > 0: substr = "##" + substr if substr in self.vocab: cur_substr = substr break end -= 1 if cur_substr is None: is_bad = True break sub_tokens.append(cur_substr) start = end if is_bad: output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) return output_tokens
def read_many_examples(input_file, is_training):
    '''2019.06.19'''
    lines_list = span_utils.read_cols_lines(input_file=input_file)
    examples = []
    for i in range(len(lines_list)):
        line_list = lines_list[i]
        paragraph_text = line_list[0]
        question_text = line_list[1]
        answer_text = line_list[2]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if span_utils.is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        if is_training:
            qas_id = 'train_' + str(i)
        else:
            qas_id = 'test_' + str(i)

        start_position = None
        end_position = None
        orig_answer_text = None
        if is_training:
            if len(answer_text) == 0:
                raise ValueError('For training, each question should have exactly 1 answer.')
            orig_answer_text = answer_text
            # answer_offset = paragraph_text.find(answer_text)
            answer_offset = span_utils.duplicate_word(
                paragraph_text=paragraph_text, span=question_text, headword=answer_text)
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                continue
        else:
            orig_answer_text = answer_text

        example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            doc_char_to_word_offset=char_to_word_offset,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples
def read_squad_examples(input_file, is_training, version_2_with_negative):
    a = 0  # counts answers that could not be recovered from the context
    quest = []
    abstract = []
    abstract_node = []  # newly added
    answer = []
    with open(input_file, encoding='utf-8') as f:
        content = f.read()
    text = json.loads(content)
    for content in text:
        for key in content:
            if key != "Abstract" and key != "am_id" and key != "transH" and key != "transE" and key != "metapth2vec":
                abstract.append(content["Abstract"])
                if content["am_id"] is None:
                    abstract_node.append([0] * 100)
                else:
                    abstract_node.append(content["metapth2vec"])  # newly added
                    # print(type(content["transH"]))  # list
                    # print(content["transH"])
                quest.append(questionbox[key])
                answer.append(content[key])
    total = len(quest)

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    i = 0
    while i < total:
        paragraph_text = abstract[i]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        # Tokenize the sentence, stripping out whitespace.
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        qas_id = str(i)
        question_text = quest[i]
        node = abstract_node[i]  # newly added
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        if is_training:
            if version_2_with_negative:
                if answer[i] == '':
                    is_impossible = True
            if not is_impossible:
                # answer = qa["answers"][0]
                orig_answer_text = answer[i]
                answer_offset = paragraph_text.find(orig_answer_text)
                if answer_offset == -1:
                    print('-----------------------------')
                    print(i, 'answer not found in paragraph')
                    print(paragraph_text)
                    print(orig_answer_text)
                    print(type(orig_answer_text))
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    a += 1
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    i += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

        example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            abstract_node=node,  # newly added
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible)
        examples.append(example)
        i += 1
    print(a)
    return examples
def read_thai_qa_examples(input_file, is_training):
    """Read a Thai QA pickle file into a list of ThaiQAExample."""
    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    input_data = pd.read_pickle(input_file)
    examples = []
    for _, entry in input_data.iterrows():
        start_position = None
        end_position = None
        paragraph_text = entry['paragraph'].replace('\n', '')
        question_id = entry['question_id']
        question_text = entry['question']
        try:
            orig_answer_text = entry['answer']
        except KeyError:  # original used a bare except
            orig_answer_text = None

        if entry['lang'] == 'thai':
            doc_tokens = word_tokenize(paragraph_text, engine='ulmfit')
            if is_training:
                char_to_word_offset = []
                for token_index, token in enumerate(doc_tokens):
                    for c in token:
                        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
                            char_to_word_offset.append(token_index - 1)
                        else:
                            char_to_word_offset.append(token_index)
                answer_offset = entry['start_pos']
                orig_answer_text = entry['answer']
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
        elif entry['lang'] != 'thai':
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            if is_training:
                answer_offset = entry['start_pos']
                answer_length = len(orig_answer_text)
                try:
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]
                except IndexError:  # original used a bare except
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue

        example = ThaiQAExample(
            qas_id=question_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples
def read_squad_examples(input_file, is_training=True, version_2_with_negative=True):
    """
    :param input_file: path of the (training) data file to read
    :param is_training: whether this is the training set
    :return: a list of SquadExample
    """
    with open(input_file, "r", encoding='utf-8') as reader:
        # dataset: every case in the data set
        dataset = json.load(reader)["data"]
        # print(len(dataset))  # 2000

    examples = []
    # Walk through and extract every case in dataset.
    for item in dataset:
        for paragraph in item['paragraphs']:
            content_text = paragraph['context']
            qas = paragraph['qas']
            doc_tokens = []
            char_to_word_offset = []
            # doc_tokens: one token per character (character-level tokenization).
            for word in content_text:
                doc_tokens.append(word)
                char_to_word_offset.append(len(doc_tokens) - 1)

            # qas holds all questions for one case; qa is a single question.
            for qa in qas:
                qa_id = qa['id']
                question_text = qa['question']
                # Default values.
                start_position = None
                end_position = None
                answer = None
                is_impossible = False
                is_yes = False
                is_no = False
                # If the data being read is training data:
                if is_training:
                    if version_2_with_negative:
                        if qa['is_impossible'] == 'false':
                            is_impossible = False
                        else:
                            is_impossible = True
                    # For training, each question should have exactly 1 answer.
                    if (len(qa['answers']) != 1) and (not is_impossible):
                        continue
                    if not is_impossible:
                        ans = qa['answers'][0]
                        answer = ans['text']
                        answer_start = ans['answer_start']
                        answer_length = len(answer)
                        start_position = char_to_word_offset[answer_start]
                        end_position = char_to_word_offset[answer_start + answer_length - 1]
                        real_answer = "".join(
                            doc_tokens[start_position:end_position + 1])
                        clean_answer = " ".join(whitespace_tokenize(answer))
                        # If the extracted answer cannot be matched against the context:
                        if real_answer.find(clean_answer) == -1:
                            if clean_answer == 'YES':
                                is_yes = True
                                answer = 'YES'
                                start_position = -1
                                end_position = -1
                            elif clean_answer == 'NO':
                                is_no = True
                                answer = 'NO'
                                start_position = -1
                                end_position = -1
                            else:
                                logger.warning(
                                    "could not find answer: '%s' vs. '%s'",
                                    real_answer, clean_answer)
                                continue
                    else:
                        start_position = -1
                        end_position = -1
                        answer = ""
                # end if is_training

                # example keys: qa_id, question_text, doc_tokens, answer, ...
                example = SquadExample(
                    qa_id=qa_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    answer=answer,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible,
                    is_yes=is_yes,
                    is_no=is_no,
                )
                examples.append(example)
    return examples
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] # wikipidia的一篇文章 for entry in input_data: # 一篇文章中的一段内容 for paragraph in entry["paragraphs"]: # 文章的具体内容 paragraph_text = paragraph["context"] doc_tokens = [] # 从char的index到word的index,需要考虑空白的影响 char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False # 每次读一个char都需要记录一次 char_to_word_offset.append(len(doc_tokens) - 1) # 遍历每一个 问题-答案 for qa in paragraph["qas"]: qas_id = qa["id"] # 问题的内容 question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: # 训练数据集只有一个答案 # dev数据集每一个问题有三个答案,但是有些答案是相同的 if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer.") # 答案 answer = qa["answers"][0] orig_answer_text = answer["text"] # 答案开始位置 answer_offset = answer["answer_start"] # 答案的字符长度 answer_length = len(orig_answer_text) # 答案开始的token位置 start_position = char_to_word_offset[answer_offset] # 答案接受的token的位置 end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. # 提取出来的答案token actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) # 这个是真的原始答案 # whitespace_tokenize 仅仅去除了空格,然后split一个array cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: # 应该不会出现吧 logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample( qas_id=qas_id, # 问答对的唯一id question_text=question_text, # 问题字符串 doc_tokens=doc_tokens, # passage的token数组 orig_answer_text=orig_answer_text, # 原始文本字符串 start_position=start_position, # 开始位置,在token数组中的位置 end_position=end_position) # 结束位置,在token数组中的位置 examples.append(example) return examples
def parse_json_squad(input_data, is_train):
    """Read a SQuAD json file into a list of SquadExample."""
    examples = list()
    for data_entry in input_data:
        for paragraph in data_entry['paragraphs']:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            # Q1. Write the code that tokenizes the context into doc_tokens
            # using is_whitespace(c).
            ###################################################################
            for char in paragraph_text:
                if is_whitespace(char):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(char)
                    else:
                        doc_tokens[-1] += char
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)  # which word is the character in?
            ###################################################################

            for qa in paragraph["qas"]:
                """
                {'answers': [{'answer_start', 'text'}], 'question', 'id'}
                """
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_train:
                    if len(qa["answers"]) != 1:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    # Q2. Referring to the variables at Line 34, fill in the
                    # parameters needed to build the SquadExample instance at Line 70.
                    ###############################################################
                    qas_id = qa["id"]               # fill in the blank -> assign None
                    question_text = qa["question"]  # fill in the blank
                    # index of word
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]  # index of word
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]  # index of word
                    ###############################################################

                    # CODE FOR handling exceptions.
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(whitespace_tokenize(
                        orig_answer_text))  # split the sentence into words on whitespace
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,  # the list of tokens (words) in the context
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)

    print("successfully converted input data into a set of {} examples".format(len(examples)))
    return examples
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    # Read all the data in the train file.
    for num, entry in enumerate(tqdm(input_data, desc="Data")):
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Added by Yue: extract features from the paragraph text.
            # paragraph_text is the paragraph context.
            paragraph_features = extract_feature_matrix(paragraph_text)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                # For a given question we have an id, a start position and an end position.
                qas_id = qa["id"]
                question_text = qa["question"]
                # Added by Yue: extract features from the question text.
                question_features = extract_feature_matrix(question_text)
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                total_features = concatenate_features(question_features,
                                                      paragraph_features)
                # `example` holds everything for one example: qas_id,
                # question_text, and so on.
                example = SquadExample(  # Added by Yue: ling_features
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible,
                    ling_features=total_features)
                examples.append(example)
    return examples
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens, char_to_word_offset = split_by_space(paragraph_text)
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
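`split_by_space` is this variant's factored-out version of the whitespace loop that the other snippets inline; its definition is not shown here, but given how its two return values are used, a plausible sketch is:

def split_by_space(paragraph_text):
    # Presumed implementation (an assumption, not the project's actual code):
    # identical to the inline loops elsewhere on this page, returning word
    # tokens plus the char->word offset table.
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if c in " \t\r\n" or ord(c) == 0x202F:
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)
    return doc_tokens, char_to_word_offset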
def tokenize(self, text):
    return whitespace_tokenize(text)
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" """The following is the arch of the element in the list: ``` { "title": "University_of_Notre_Dame", "paragraphs": [ { "context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "qas": [ { "answers": [ { "answer_start": 515, "text": "Saint Bernadette Soubirous" } ], "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", "id": "5733be284776f41900661182" }, ... ... ] }, ... ... } ``` """ with open(input_file, "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] # list, len=442 def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer." ) answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
def read(self, input_file, read_state, sample_ratio: float = 0.5,
         dialog_turns: int = 2, extra_sen_file: str = None) -> List[QAFullExample]:
    """
    :param input_file: input file to load data. The format is in CoQA style.
    :param read_state: If read extra sentences from CoQA dataset.
    :param sample_ratio: the ratio of negative sampling.
    :param dialog_turns: Decide how many turns' questions and answers will be
        appended before the current question.
    :param extra_sen_file: If read_extra_self is False, then this parameter
        must be specified as the path of the extra sentence file.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    logger.info('Read parameters:')
    logger.info('Dialog turns: {}'.format(dialog_turns))
    logger.info('Read state: {}'.format(read_state))
    logger.info('Sample ratio: {}'.format(sample_ratio))
    logger.info('Extra sentence file: {}'.format(extra_sen_file))
    assert read_state in ReadState

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        if ch == " " or ch == "\t" or ch == "\r" or ch == "\n" or ord(ch) == 0x202F:
            return True
        return False

    all_sentences = []
    if read_state == ReadState.SampleFromSelf:
        for paragraph in input_data:
            for sentence in self.sentence_tokenizer.tokenize(paragraph['story']):
                sentence_tokens = whitespace_tokenize(sentence)
                if sentence_tokens:
                    all_sentences.append(sentence_tokens)
                else:
                    logger.warning('Empty sentence!')
        # all_sentences.extend(
        #     [whitespace_tokenize(sentence) for sentence in self.sentence_tokenizer.tokenize(paragraph['story'])])
    elif read_state == ReadState.SampleFromExternal:
        pass
    logger.info('Read extra sentences: {}'.format(len(all_sentences)))

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["story"]
        story_id = paragraph['id']
        doc_tokens = []
        prev_is_whitespace = True
        char_to_word_offset = []
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences.
        sentence_start_list, sentence_end_list = utils.split_sentence(
            paragraph_text, self.sentence_tokenizer)
        sentence_span_list = []
        for c_start, c_end in zip(sentence_start_list, sentence_end_list):
            t_start = char_to_word_offset[c_start]
            t_end = char_to_word_offset[c_end]
            sentence_span_list.append((t_start, t_end))
        doc_sentence_tokens = [doc_tokens[span[0]:(span[1] + 1)]
                               for span in sentence_span_list]

        questions = paragraph['questions']
        answers = paragraph['answers']
        for i, (question, answer) in enumerate(zip(questions, answers)):
            question_text = question['input_text']

            # We are only concerned about questions with Yes/No as answers.
            answer_type = utils.normalize_answer(answer['input_text'])
            if answer_type not in ['yes', 'no']:
                continue
            if answer_type == 'yes':
                answer_choice = 0
            else:
                answer_choice = 1

            for j in range(dialog_turns):
                pre_idx = i - (j + 1)
                if pre_idx >= 0:
                    question_text = questions[pre_idx]['input_text'] + '<Q>' + \
                        answers[pre_idx]['input_text'] + '<A>' + question_text

            qas_id = story_id + '--' + str(i + 1)

            # Add rationale start and end as extra supervised labels.
            rationale_start_position = char_to_word_offset[answer['span_start']]
            rationale_end_position = char_to_word_offset[answer['span_end'] - 1]

            sentence_id = utils.find_evidence_sentence(
                sentence_span_list, rationale_start_position, rationale_end_position)

            # Add negative samples.
            if read_state != ReadState.NoNegative:
                new_doc_tokens, sentence_label, new_sentence_id, sentence_span_list, orig_token_map = \
                    utils.generate_seq_with_negative_sample(
                        doc_sentence_tokens, all_sentences, sample_ratio,
                        target_index=sentence_id)
                rationale_start_position = orig_token_map[rationale_start_position]
                rationale_end_position = orig_token_map[rationale_end_position]
            else:
                new_doc_tokens = doc_tokens
                sentence_label = [0] * len(sentence_span_list)
                new_sentence_id = sentence_id

            example = QAFullExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=new_doc_tokens,
                sentence_span_list=sentence_span_list,
                orig_answer_text="",
                start_position=None,
                end_position=None,
                sentence_id=new_sentence_id,
                is_impossible=answer_choice,
                ral_start_position=rationale_start_position,
                ral_end_position=rationale_end_position,
                meta_data={'sentence_label': sentence_label})
            examples.append(example)
    return examples
fin = open("/u/scratch/d/datduong/w2vModel1Gram9Jan2019/vocab.txt", "r") counter = 0 for line in tqdm(fin): if counter == 0: counter = 1 # skip header continue pubmed_vocab.append(line.split()[0]) fin.close() pubmed_vocab = set(pubmed_vocab) ## read in def, do white space, intersect with pubmed GOdb_vocab = [] GOdb = pd.read_csv("go_def_in_obo.tsv", sep="\t") for defin in list(GOdb['def']): token = whitespace_tokenize(defin) token = list(set(token)) GOdb_vocab = GOdb_vocab + token GOdb_vocab = set(GOdb_vocab) GOdb_vocab = GOdb_vocab.intersection(pubmed_vocab) GOdb_vocab = list(GOdb_vocab) GOdb_vocab.sort() GOdb_vocab = ['[PAD]', '[UNK]'] + GOdb_vocab ## ADD PADDING fout = open('word_pubmed_intersect_GOdb.txt', 'w') fout.write("\n".join(s for s in GOdb_vocab)) fout.close() ## create init embed.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--inputs',
        required=True,
        nargs='+',
        help='files to process.',
    )
    parser.add_argument(
        '--output',
        required=True,
        metavar='DIR',
        help='Path for output',
    )
    args = parser.parse_args()
    print(args)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def process(s):
        try:
            return tokenizer.tokenize(s)
        except:
            print('failed on', s)
            raise

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    for inp in args.inputs:
        bad_qs = 0
        num_qs = 0
        filename = os.path.basename(inp)
        base_filename = os.path.splitext(filename)[0]
        s1_filename = base_filename + '_1.txt'
        s2_filename = base_filename + '_2.txt'
        s3_filename = base_filename + '_3.txt'
        s4_filename = base_filename + '_4.txt'
        id_filename = base_filename + '.id'
        label_filename = base_filename + '.lbl'
        with open(inp, 'r') as f_in, \
                open(os.path.join(args.output, s1_filename), 'w') as s1_out, \
                open(os.path.join(args.output, s2_filename), 'w') as s2_out, \
                open(os.path.join(args.output, id_filename), 'w') as id_out, \
                open(os.path.join(args.output, label_filename), 'w') as lbl_out, \
                open(os.path.join(args.output, s3_filename), 'w') as s3_out, \
                open(os.path.join(args.output, s4_filename), 'w') as s4_out:
            data = json.load(f_in)
            for example in data['data']:
                for p in example['paragraphs']:
                    context = p['context']
                    doc_tokens = []
                    char_to_word_offset = []
                    prev_is_whitespace = True
                    for c in context:
                        if is_whitespace(c):
                            prev_is_whitespace = True
                        else:
                            if prev_is_whitespace:
                                doc_tokens.append(c)
                            else:
                                doc_tokens[-1] += c
                            prev_is_whitespace = False
                        char_to_word_offset.append(len(doc_tokens) - 1)

                    orig_to_tok_index = []
                    tok_to_orig_index = []
                    all_doc_tokens = []
                    for (i, token) in enumerate(doc_tokens):
                        orig_to_tok_index.append(len(all_doc_tokens))
                        sub_tokens = process(token)
                        for sub_token in sub_tokens:
                            tok_to_orig_index.append(i)
                            all_doc_tokens.append(sub_token)

                    for qa in p['qas']:
                        num_qs += 1
                        q = process(qa['question'])
                        is_impossible = True  # qa['is_impossible']
                        answer = qa['answers'][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                        tok_start_position = orig_to_tok_index[start_position]
                        if end_position < len(doc_tokens) - 1:
                            tok_end_position = orig_to_tok_index[end_position + 1] - 1
                        else:
                            tok_end_position = len(all_doc_tokens) - 1
                        (tok_start_position, tok_end_position) = _improve_answer_span(
                            all_doc_tokens, tok_start_position, tok_end_position,
                            process, orig_answer_text)
                        if not is_impossible:
                            # print('bad question:', str(q))
                            bad_qs += 1
                            continue
                        print(' '.join(all_doc_tokens), file=s1_out)
                        print(' '.join(q), file=s2_out)
                        print(' '.join(doc_tokens), file=s3_out)
                        print(' '.join([str(ii) for ii in tok_to_orig_index]), file=s4_out)
                        print(qa['id'], file=id_out)
                        lbl_str = f'{int(is_impossible)}'
                        lbl_str += f' {tok_start_position} {tok_end_position}'
                        print(lbl_str, file=lbl_out)
        print('bad questions:', bad_qs, 'out of', num_qs)
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer." ) answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
def read_squad_examples(input_data, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    if type(input_data) == str:
        with open(input_data, "r", encoding='utf-8') as reader:
            input_data = json.load(reader)["data"]
    elif type(input_data) == dict:
        question = input_data['question']
        paragraphs = input_data['paragraphs']
        examples = []
        for p in paragraphs:
            examples.append({
                'context': p,
                'qas': [{
                    u'answers': [],
                    u'id': uuid.uuid4().hex,
                    u'question': question,
                    'is_impossible': True,
                    'plausible_answers': []
                }]
            })
        input_data = [{'title': question, 'paragraphs': examples}]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            # logger.warning("Could not find answer: '%s' vs. '%s'",
                            #                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
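This last variant also accepts a plain dict for on-the-fly inference, wrapping the question and raw paragraphs in a synthetic SQuAD-v2-style structure. A usage sketch (the question and paragraph strings are made up for illustration):

query = {
    'question': 'Who founded the university?',
    'paragraphs': [
        'The university was founded in 1842 by Rev. Edward Sorin.',
        'The main building is topped by a golden dome.',
    ],
}
# With is_training=False no answers are required, so each paragraph simply
# becomes one unanswered (is_impossible) SquadExample.
examples = read_squad_examples(query, is_training=False,
                               version_2_with_negative=True)
assert len(examples) == 2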