def split_data(self, filename):
        self.load_data(filename)
        sub_dir = filename.split('-')[0]

        # create a subdirectory for Train and Dev data
        if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
            os.makedirs(os.path.join(self.data_dir, sub_dir))

        with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.labels'), 'w', encoding="utf-8") as labels_file:

            # loop over the data
            for article_id in tqdm.tqdm(range(len(self.data['data']))):
                list_paragraphs = self.data['data'][article_id]['paragraphs']
                # loop over the paragraphs
                for paragraph in list_paragraphs:
                    context = paragraph['context']
                    context = clean_text(context)
                    context_tokens = [w for w in word_tokenize(context) if w]
                    spans = convert_idx(context, context_tokens)
                    qas = paragraph['qas']
                    # loop over Q/A
                    for qa in qas:
                        question = qa['question']
                        question = clean_text(question)
                        question_tokens = [w for w in word_tokenize(question) if w]
                        if sub_dir == "train":
                            # keep only one ground truth (the top answer), if there are any answers
                            answer_ids = 1 if qa['answers'] else 0
                        else:
                            answer_ids = len(qa['answers'])
                        labels = []
                        if answer_ids:
                            for answer_id in range(answer_ids):
                                answer = qa['answers'][answer_id]['text']
                                answer = clean_text(answer)
                                answer_tokens = [w for w in word_tokenize(answer) if w]
                                answer_start = qa['answers'][answer_id]['answer_start']
                                answer_stop = answer_start + len(answer)
                                answer_span = []
                                for idx, span in enumerate(spans):
                                    if not (answer_stop <= span[0] or answer_start >= span[1]):
                                        answer_span.append(idx)
                                if not answer_span:
                                    continue
                                labels.append(str(answer_span[0]) + ' ' + str(answer_span[-1]))

                            # write to file
                            context_file.write(' '.join(context_tokens) + '\n')
                            question_file.write(' '.join(question_tokens) + '\n')
                            answer_file.write(' '.join(answer_tokens) + '\n')
                            labels_file.write("|".join(labels) + "\n")
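Note: every example on this page calls a `convert_idx` helper that is not included in the snippets. A minimal sketch, assuming it behaves like the usual SQuAD-style preprocessing utility that maps each token to its (start, end) character span in the original text (the project-specific version may differ):

def convert_idx(text, tokens):
    """Sketch: map each token to its (start, end) character offsets in `text`.

    Assumes the tokens occur in `text` in order, as produced by word_tokenize
    on the same cleaned string; a token that cannot be located raises an error.
    """
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            raise ValueError("Token {!r} cannot be found in text".format(token))
        spans.append((current, current + len(token)))
        current += len(token)
    return spans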
Example #2
    def split_data(self, filename):
        self.load_data(filename)
        sub_dir = filename.split('-')[0]

        # create a subdirectory for Train and Dev data
        if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
            os.makedirs(os.path.join(self.data_dir, sub_dir))

        with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file:

            # loop over the data
            for article_id in tqdm.tqdm(range(len(self.data['data']))):
                list_paragraphs = self.data['data'][article_id]['paragraphs']
                # loop over the paragraphs
                for paragraph in list_paragraphs:
                    context = paragraph['context']
                    context = clean_text(context)
                    context_tokens = word_tokenize(context)
                    if config.paragraph and (
                            len(context_tokens) < config.min_len_context
                            or len(context_tokens) > config.max_len_context):
                        continue
                    # keep each sentence as a list of tokens so that sentence
                    # offsets line up with the token indices computed below
                    context_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(context)]
                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)
                    qas = paragraph['qas']
                    # loop over Q/A
                    for qa in qas:
                        question = qa['question']
                        question = clean_text(question)
                        question_tokens = word_tokenize(question)
                        if (not question_tokens
                                or question_tokens[-1] != "?"
                                or len(question_tokens) < config.min_len_question
                                or len(question_tokens) > config.max_len_question):
                            continue
                        if sub_dir == "train":
                            # keep only one ground truth (the top answer), if there are any answers
                            answer_ids = 1 if qa['answers'] else 0
                        else:
                            answer_ids = len(qa['answers'])
                        if answer_ids:
                            for answer_id in range(answer_ids):
                                answer = qa['answers'][answer_id]['text']
                                answer = clean_text(answer)
                                answer_tokens = word_tokenize(answer)
                                answer_start = qa['answers'][answer_id][
                                    'answer_start']
                                answer_stop = answer_start + len(answer)

                                # Getting spans of the answer in the context
                                answer_span = []
                                for idx, span in enumerate(spans):
                                    if not (answer_stop <= span[0]
                                            or answer_start >= span[1]):
                                        answer_span.append(idx)
                                if not answer_span:
                                    continue

                                # Getting the sentence that contains the answer and
                                # re-indexing the answer span relative to that sentence
                                sentence_tokens = []
                                for idx, start in enumerate(first_token_sentence):
                                    if answer_span[0] >= start:
                                        sentence_tokens = context_sentences[idx]
                                        answer_sentence_span = [span - start for span in answer_span]
                                    else:
                                        break
                                if not sentence_tokens:
                                    print("Sentence cannot be found")
                                    raise Exception()

                            # write to file: tag each token with │1 if it belongs to the answer span, │0 otherwise
                            context_file.write(" ".join([
                                token + u"│" + ("1" if idx in answer_span else "0")
                                for idx, token in enumerate(context_tokens)
                            ]) + "\n")
                            sentence_file.write(" ".join([
                                token + u"│" + ("1" if idx in answer_sentence_span else "0")
                                for idx, token in enumerate(sentence_tokens)
                            ]) + "\n")
                            question_file.write(" ".join(question_tokens) + "\n")
                            answer_file.write(" ".join(answer_tokens) + "\n")
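For reference, the `token│label` format written above tags every token with │1 when its index falls inside the answer span and │0 otherwise. A tiny self-contained demo of the same join expression, using made-up tokens and a made-up span:

context_tokens = ["The", "cat", "sat", "on", "the", "mat", "."]
answer_span = [1]  # token index of "cat" (hypothetical)
line = " ".join(
    token + u"│" + ("1" if idx in answer_span else "0")
    for idx, token in enumerate(context_tokens)
)
print(line)  # The│0 cat│1 sat│0 on│0 the│0 mat│0 .│0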
Example #3
    def split_data(self, filename):
        self.load_data(filename)

        envs = ["train", "dev"]
        for sub_dir in envs:
            # create a subdirectory for Train and Dev data
            if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
                os.makedirs(os.path.join(self.data_dir, sub_dir))

            with open(os.path.join(self.data_dir, sub_dir, sub_dir + ".context"), "w", encoding="utf-8") as context_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".sentence"), "w", encoding="utf-8") as sentence_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".question"), "w", encoding="utf-8") as question_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".answer"), "w", encoding="utf-8") as answer_file:

                # loop over the data
                for article in tqdm.tqdm(self.data["data"]):
                    context = article["text"]
                    context_tokens = word_tokenize(context)
                    # keep each sentence as a list of tokens so that sentence
                    # offsets line up with the token indices computed below
                    context_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(context)]
                    if config.paragraph and (
                            len(context_tokens) < config.min_len_context
                            or len(context_tokens) > config.max_len_context):
                        continue
                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)
                    if article["type"] != sub_dir:
                        continue
                    for question in article["questions"]:
                        if question.get("isQuestionBad") == 0 and question[
                                "consensus"].get("s"):
                            q = question["q"].strip()
                            if q[-1] != "?" or len(q.split(
                            )) < config.min_len_question or len(
                                    q.split()) > config.max_len_question:
                                continue
                            answer_start = question["consensus"]["s"]
                            answer_end = question["consensus"]["e"]
                            answer = context[answer_start:answer_end].strip(".| ").strip("\n")
                            answer_stop = answer_start + len(answer)

                            # Getting spans of the answer in the context
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_stop <= span[0]
                                        or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if not answer_span:
                                continue

                            # Getting the sentence where we have the answer
                            sentence_tokens = []
                            for idx, start in enumerate(first_token_sentence):
                                if answer_span[0] >= start:
                                    sentence_tokens = context_sentences[idx]
                                    answer_sentence_span = [
                                        span - start for span in answer_span
                                    ]
                                else:
                                    break

                            # write to file
                            sent = []
                            for idx, token in enumerate(sentence_tokens):
                                if token.strip("\n").strip():
                                    if idx in answer_sentence_span:
                                        sent.append(token + u"│" + "1")
                                    else:
                                        sent.append(token + u"│" + "0")
                            sent = " ".join(sent)
                            sent = sent.strip()
                            # drop the leading "(CNN) --" prefix found in CNN/NewsQA articles
                            index = sent.find("(│0 CNN│0 )│0 --│0 ")
                            if index > -1:
                                sent = sent[index +
                                            len("(│0 CNN│0 )│0 --│0 "):]

                            ctxt = []
                            for idx, token in enumerate(context_tokens):
                                if token.strip("\n").strip():
                                    if idx in answer_span:
                                        ctxt.append(token + u"│" + "1")
                                    else:
                                        ctxt.append(token + u"│" + "0")
                            ctxt = " ".join(ctxt)
                            ctxt = ctxt.strip()
                            index = ctxt.find("(│0 CNN│0 )│0 --│0 ")
                            if index > -1:
                                ctxt = ctxt[index +
                                            len("(│0 CNN│0 )│0 --│0 "):]

                            context_file.write(ctxt + "\n")
                            sentence_file.write(sent + "\n")
                            question_file.write(q + "\n")
                            answer_file.write(answer + "\n")
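The span test shared by all of the examples keeps token idx whenever the token's character interval [start, end) overlaps the answer interval [answer_start, answer_stop): two intervals intersect unless one ends before the other begins. A toy check with hypothetical offsets:

spans = [(0, 3), (4, 7), (8, 11)]   # character spans of "The", "cat", "sat"
answer_start, answer_stop = 4, 7    # character range of the answer "cat"
answer_span = [
    idx for idx, (start, end) in enumerate(spans)
    if not (answer_stop <= start or answer_start >= end)
]
print(answer_span)  # [1]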
Example #4
def process_file(filename, data_type, word_counter, char_counter, ques_limit):
    """
    从文本文件中读取内容后进行初步处理。如果数据集是train的话,需要进行内容过滤
    :param filename:
    :param data_type:
    :param word_counter:
    :param char_counter:
    :return:
    """

    print("Processing {} examples...".format(data_type))
    examples = []
    eval_examples = {}
    total = 0

    with open(filename, "r", encoding="utf-8") as fh:
        source = json.load(fh)
        # TODO: filtering is done during preprocessing, but the spans cannot be recomputed afterwards
        for article in tqdm(source):
            content = article['article_title'] + '。' + article['article_content']
            content_tokens = word_tokenize(content)
            content_chars = [list(token) for token in content_tokens]
            spans = convert_idx(content, content_tokens)

            for token in content_tokens:
                word_counter[token] += len(article['questions'])
                for char in token:
                    char_counter[char] += len(article["questions"])

            for q in article['questions']:
                question_text = q["question"]
                answer_text = q['answer']
                question_tokens = word_tokenize(question_text)
                question_tokens = shrink_question_tokens(
                    question_tokens, ques_limit)

                question_chars = [list(token) for token in question_tokens]
                result = list(substring_indexes(answer_text, content))

                for token in question_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1

                if len(result) == 1:
                    # convert the character position in `result` to token positions;
                    # y1 and y2 are the start and end token indices respectively
                    current_pos, start_token, end_token = 0, -1, -1
                    for token_cnt, token in enumerate(content_tokens):
                        if current_pos > result[0] and start_token == -1:
                            start_token = token_cnt - 1
                        if current_pos > result[0] + len(q["answer"]):
                            end_token = token_cnt - 2
                            break
                        current_pos += len(token)
                    total += 1
                    example = {
                        "context_tokens": content_tokens,
                        "context_chars": content_chars,
                        "ques_tokens": question_tokens,
                        "ques_chars": question_chars,
                        "y1s": [start_token],
                        "y2s": [end_token],
                        "id": total
                    }
                    eval_examples[str(total)] = {
                        "context": content,
                        "spans": spans,  # mapping from every token in the full text to its character span
                        "answers": [answer_text],  # TODO: change to a version that is not split by paragraph
                        "uuid": q["questions_id"]
                    }  # `example` does not keep the raw question text, so it is stored here for later result display
                    examples.append(example)  # questions that cross segment boundaries are ignored

        random.shuffle(examples)
        print("{} questions in total".format(len(examples)))
    return examples, eval_examples
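`process_file` also depends on a `substring_indexes` helper that is not shown. A minimal sketch, assuming it yields the character index of every occurrence of the answer text in the content (so the `len(result) == 1` check keeps only answers that appear exactly once):

def substring_indexes(substring, string):
    """Sketch: yield the start index of every occurrence of `substring` in `string`."""
    start = string.find(substring)
    while start != -1:
        yield start
        start = string.find(substring, start + 1)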
Example #5
    def split_sentence_question(self, filename, data_type):
        data = self.load_data(filename)
        with open(os.path.join(self.save_dir, data_type + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
             open(os.path.join(self.save_dir, data_type + '.question'), 'w', encoding="utf-8") as question_file:

            articles = data
            for article in tqdm(articles):
                paragraphs = article['paragraphs']
                for paragraph in paragraphs:
                    context = paragraph['context']

                    context = clean_text(context)
                    context_tokens = word_tokenize(context)
                    # keep each sentence as a list of tokens so that sentence
                    # offsets line up with the token indices computed below
                    context_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(context)]

                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)

                    question_and_answer_list = paragraph['qas']
                    for question_and_answer in question_and_answer_list:
                        question = question_and_answer['question']
                        question = clean_text(question)
                        question_tokens = word_tokenize(question)

                        if len(question_tokens) > MAX_QUESTION_LENGTH or len(
                                question_tokens) < MIN_QUESTION_LENGHT:
                            continue

                        if not question_and_answer['answers']: continue
                        answer = question_and_answer['answers'][0]
                        answer_text = answer['text']
                        answer_text = clean_text(answer_text)
                        answer_tokens = word_tokenize(answer_text)
                        answer_start = answer['answer_start']
                        answer_stop = answer_start + len(answer_text)

                        answer_span = []
                        for idx, span in enumerate(spans):
                            if not (answer_stop <= span[0]
                                    or answer_start >= span[1]):
                                answer_span.append(idx)
                        if not answer_span: continue

                        sentence_tokens = []
                        for idx, start in enumerate(first_token_sentence):
                            if answer_span[0] >= start:
                                sentence_tokens = context_sentences[idx]
                                answer_sentence_span = [
                                    span - start for span in answer_span
                                ]
                            else:
                                break
                        if not sentence_tokens:
                            print("Sentence cannot be found")
                            raise Exception()

                        if len(sentence_tokens) > MAX_SENTENCE_LENGTH or len(
                                sentence_tokens) < MIN_SENTENCE_LENGTH:
                            continue

                        sentence_file.write(" ".join([
                            token + u"│" +
                            "1" if idx in answer_sentence_span else token +
                            u"│" + "0"
                            for idx, token in enumerate(sentence_tokens)
                        ]) + "\n")
                        question_file.write(
                            " ".join([token
                                      for token in question_tokens]) + "\n")
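None of the snippets include their imports. A plausible header for running them, assuming the NLTK tokenizers and the standard-library modules used above (`clean_text`, `convert_idx`, `shrink_question_tokens`, `config`, and the length constants come from the surrounding projects and are not reproduced here):

import json
import os
import random

import tqdm  # examples 1-3 call tqdm.tqdm(...); examples 4-5 instead use `from tqdm import tqdm`
from nltk.tokenize import sent_tokenize, word_tokenize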