Example #1
    def preprocess(self, dataset_label):
        file_name = self.train_file if dataset_label == 'train' else (
            self.dev_file if dataset_label == 'dev' else self.test_file)
        output_file_name = os.path.join(
            self.spacyDir,
            self.data_prefix + dataset_label + '-preprocessed.json')

        print('Preprocessing', dataset_label, 'file:', file_name)
        print('Loading json...')
        with open(file_name, 'r') as f:
            dataset = json.load(f)

        print('Processing json...')

        dict1 = ['where', 'when', 'who']  # wh-words for the question-type heuristic
        data = []
        tot = len(dataset['data'])
        type1 = type2 = 0  # extractive / generative answer counters
        for data_idx in tqdm(range(tot)):
            datum = dataset['data'][data_idx]
            context_str = datum['story']
            _datum = {
                'context': context_str,
                'source': datum['source'],
                'id': datum['id']
            }

            nlp_context = nlp(pre_proc(context_str))
            _datum['annotated_context'] = self.process(nlp_context)
            _datum['raw_context_offsets'] = self.get_raw_context_offsets(
                _datum['annotated_context']['word'], context_str)
            _datum['qas'] = []

            assert len(datum['questions']) == len(datum['answers'])

            for i in range(len(datum['questions'])):
                question, answer = datum['questions'][i], datum['answers'][i]
                assert question['turn_id'] == answer['turn_id']

                idx = question['turn_id']
                _qas = {
                    'turn_id': idx,
                    'question': question['input_text'],
                    'answer': answer['input_text']
                }

                _qas['annotated_question'] = self.process(
                    nlp(pre_proc(question['input_text'])))

                _qas['annotated_answer'] = self.process(
                    nlp(pre_proc(answer['input_text'])))
                _qas['raw_answer'] = answer['input_text']
                _qas['span_text'] = answer['span_text']

                tmp = _qas['raw_answer']
                tmp = self.removePunctuation(tmp)
                if _qas['raw_answer'] in context_str or tmp.lower() in [
                        "yes", "no", "unknown"
                ]:
                    type1 += 1
                    _qas['answer_type'] = "extractive"
                else:
                    type2 += 1
                    _qas['answer_type'] = "generative"
                _qas['answer_span_start'] = answer['span_start']
                _qas['answer_span_end'] = answer['span_end']

                sign = ""
                ques = question['input_text'].lower()
                real_ans = answer['input_text'].lower()
                real = self.remove_punctual(real_ans)
                real = real.split()

                # Question-type heuristic: wh-words (dict1) or common
                # auxiliary prefixes mark the question as factual. The
                # prefix checks do not depend on `word`, so a question
                # starting with "was"/"were"/"is" matches immediately.
                for word in dict1:
                    if (word in ques or ques[:3] == "was"
                            or ques[:4] == 'were' or ques[:2] == 'is'):
                        sign = "factual"
                        break

                if len(real) <= 4:
                    sign = "factual"
                # fallback: any question still unlabeled is also marked factual,
                # so `sign` always ends up "factual" after this block
                if not sign or real_ans in ("no", "yes", "unknown"):
                    sign = "factual"

                _qas['question_type'] = sign

                start = answer['span_start']  # rationale span
                end = answer['span_end']
                chosen_text = _datum['context'][start:end].lower()
                # strip leading whitespace (\t, \n, etc.)
                while len(chosen_text) > 0 and chosen_text[0] in string.whitespace:
                    chosen_text = chosen_text[1:]
                    start += 1
                # strip trailing whitespace
                while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace:
                    chosen_text = chosen_text[:-1]
                    end -= 1
                input_text = _qas['answer'].strip().lower()
                if input_text in chosen_text:
                    p = chosen_text.find(input_text)  # p: start of input_text within chosen_text
                    _qas['answer_span'] = self.find_span(
                        _datum['raw_context_offsets'], start + p,
                        start + p + len(input_text))
                else:
                    _qas['answer_span'] = self.find_span_with_gt(
                        _datum['context'], _datum['raw_context_offsets'],
                        input_text)

                _datum['qas'].append(_qas)
            data.append(_datum)

        # build vocabulary
        if dataset_label == 'train':
            print('Build vocabulary from training data...')
            contexts = [_datum['annotated_context']['word'] for _datum in data]
            qas = [
                qa['annotated_question']['word'] +
                qa['annotated_answer']['word']
                for _datum in data for qa in _datum['qas']
            ]
            self.train_vocab = self.build_vocab(contexts, qas)

        print('Getting word ids...')
        w2id = {w: i for i, w in enumerate(self.train_vocab)}
        for _datum in data:
            _datum['annotated_context']['wordid'] = token2id_sent(
                _datum['annotated_context']['word'],
                w2id,
                unk_id=1,
                to_lower=False)
            # map question/answer tokens to word ids
            for qa in _datum['qas']:
                qa['annotated_question']['wordid'] = token2id_sent(
                    qa['annotated_question']['word'],
                    w2id,
                    unk_id=1,
                    to_lower=False)
                qa['annotated_answer']['wordid'] = token2id_sent(
                    qa['annotated_answer']['word'],
                    w2id,
                    unk_id=1,
                    to_lower=False)

        if dataset_label == 'train':
            # get the condensed dictionary embedding
            print('Getting embedding matrix for ' + dataset_label)
            embedding = build_embedding(self.glove_file, self.train_vocab,
                                        self.glove_dim)
            meta = {'vocab': self.train_vocab, 'embedding': embedding.tolist()}
            meta_file_name = os.path.join(self.spacyDir,
                                          dataset_label + '_meta.msgpack')
            print('Saving meta information to', meta_file_name)
            with open(meta_file_name, 'wb') as f:
                # msgpack.dump(meta, f, encoding='utf8')
                msgpack.dump(meta, f)

        dataset['data'] = data

        if dataset_label == 'test':
            return dataset

        with open(output_file_name, 'w') as output_file:
            json.dump(dataset, output_file, sort_keys=True, indent=4)
        print("The amount of extractive qa is: ", type1)
        print("The amount of generative qa is: ", type2)
Example #2
    def preprocess(self, dataset_label):
        file_name = self.train_file if dataset_label == 'train' else (self.dev_file if dataset_label == 'dev' else self.test_file)
        output_file_name = os.path.join(self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json')

        print('Preprocessing', dataset_label, 'file:', file_name)
        print('Loading json...')
        with open(file_name, 'r') as f:
            dataset = json.load(f)

        print('Processing json...')

        data = []
        tot = len(dataset['data'])
        for data_idx in tqdm(range(tot)):
            datum = dataset['data'][data_idx]
            context_str = datum['story']
            _datum = {'context': context_str,
                      'source': datum['source'],
                      'id': datum['id'],
                      'filename': datum['filename']}

            nlp_context = nlp(pre_proc(context_str))
            _datum['annotated_context'] = self.process(nlp_context)
            _datum['raw_context_offsets'] = self.get_raw_context_offsets(_datum['annotated_context']['word'], context_str)
            _datum['qas'] = []
            assert len(datum['questions']) == len(datum['answers'])

            additional_answers = {}
            if 'additional_answers' in datum:
                for k, answer in datum['additional_answers'].items():
                    if len(answer) == len(datum['answers']):
                        for ex in answer:
                            idx = ex['turn_id']
                            if idx not in additional_answers:
                                additional_answers[idx] = []
                            additional_answers[idx].append(ex['input_text'])  # additional answers are only used for evaluation, so raw text is fine

            for i in range(len(datum['questions'])):
                question, answer = datum['questions'][i], datum['answers'][i]
                assert question['turn_id'] == answer['turn_id']

                idx = question['turn_id']
                _qas = {'turn_id': idx,
                        'question': question['input_text'],
                        'answer': answer['input_text']}
                if idx in additional_answers:
                    _qas['additional_answers'] = additional_answers[idx]

                _qas['annotated_question'] = self.process(nlp(pre_proc(question['input_text'])))
                _qas['annotated_answer'] = self.process(nlp(pre_proc(answer['input_text'])))
                _qas['raw_answer'] = answer['input_text']
                _qas['answer_span_start'] = answer['span_start']
                _qas['answer_span_end'] = answer['span_end']

                start = answer['span_start']
                end = answer['span_end']
                chosen_text = _datum['context'][start: end].lower()
                while len(chosen_text) > 0 and chosen_text[0] in string.whitespace:
                    chosen_text = chosen_text[1:]
                    start += 1
                while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace:
                    chosen_text = chosen_text[:-1]
                    end -= 1
                input_text = _qas['answer'].strip().lower()
                if input_text in chosen_text:
                    p = chosen_text.find(input_text)
                    _qas['answer_span'] = self.find_span(_datum['raw_context_offsets'],
                                                         start + p, start + p + len(input_text))
                else:
                    _qas['answer_span'] = self.find_span_with_gt(_datum['context'],
                                                                 _datum['raw_context_offsets'], input_text)
                # Build a dialog-history window: the previous two QA pairs
                # followed by the current question.
                long_question = ''
                for j in range(i - 2, i + 1):
                    if j < 0:
                        continue
                    long_question += ' ' + datum['questions'][j]['input_text']
                    if j < i:
                        long_question += ' ' + datum['answers'][j]['input_text']

                long_question = long_question.strip()
                nlp_long_question = nlp(long_question)
                _qas['context_features'] = feature_gen(nlp_context, nlp_long_question)

                _datum['qas'].append(_qas)
            data.append(_datum)

        # build vocabulary
        if dataset_label == 'train':
            print('Build vocabulary from training data...')
            contexts = [_datum['annotated_context']['word'] for _datum in data]
            qas = [qa['annotated_question']['word'] + qa['annotated_answer']['word'] for _datum in data for qa in _datum['qas']]
            self.train_vocab = self.build_vocab(contexts, qas)
            self.train_char_vocab = self.build_char_vocab(self.train_vocab)

        print('Getting word ids...')
        w2id = {w: i for i, w in enumerate(self.train_vocab)}
        c2id = {c: i for i, c in enumerate(self.train_char_vocab)}
        for _datum in data:
            _datum['annotated_context']['wordid'] = token2id_sent(_datum['annotated_context']['word'], w2id, unk_id=1, to_lower=False)
            _datum['annotated_context']['charid'] = char2id_sent(_datum['annotated_context']['word'], c2id, unk_id=1, to_lower=False)
            for qa in _datum['qas']:
                qa['annotated_question']['wordid'] = token2id_sent(qa['annotated_question']['word'], w2id, unk_id=1, to_lower=False)
                qa['annotated_question']['charid'] = char2id_sent(qa['annotated_question']['word'], c2id, unk_id=1, to_lower=False)
                qa['annotated_answer']['wordid'] = token2id_sent(qa['annotated_answer']['word'], w2id, unk_id=1, to_lower=False)
                qa['annotated_answer']['charid'] = char2id_sent(qa['annotated_answer']['word'], c2id, unk_id=1, to_lower=False)

        if dataset_label == 'train':
            # get the condensed dictionary embedding
            print('Getting embedding matrix for ' + dataset_label)
            embedding = build_embedding(self.glove_file, self.train_vocab, self.glove_dim)
            meta = {'vocab': self.train_vocab, 'char_vocab': self.train_char_vocab, 'embedding': embedding.tolist()}
            meta_file_name = os.path.join(self.spacyDir, dataset_label + '_meta.msgpack')
            print('Saving meta information to', meta_file_name)
            with open(meta_file_name, 'wb') as f:
                # note: the `encoding` kwarg was removed in msgpack-python >= 1.0;
                # newer versions need plain msgpack.dump(meta, f)
                msgpack.dump(meta, f, encoding='utf8')

        dataset['data'] = data

        if dataset_label == 'test':
            return dataset

        with open(output_file_name, 'w') as output_file:
            json.dump(dataset, output_file, sort_keys=True, indent=4)
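Both variants map tokens to ids via token2id_sent (and Example #2 adds char2id_sent), neither of which appears in the snippets. A minimal sketch matching the call signature token2id_sent(words, w2id, unk_id=1, to_lower=False) (an assumption, not the repository's exact code):

    def token2id_sent(sent, w2id, unk_id=None, to_lower=False):
        # Map each token to its vocabulary id, falling back to unk_id
        # for out-of-vocabulary tokens.
        if to_lower:
            sent = [w.lower() for w in sent]
        return [w2id.get(w, unk_id) for w in sent]

    def char2id_sent(sent, c2id, unk_id=None, to_lower=False):
        # Map each token to a list of character ids, with unk_id for
        # characters missing from the character vocabulary.
        if to_lower:
            sent = [w.lower() for w in sent]
        return [[c2id.get(c, unk_id) for c in w] for w in sent]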
Example #3
    def preprocess(self, dataset_label):
        file_name = self.train_file if dataset_label == 'train' else (
            self.dev_file if dataset_label == 'dev' else self.test_file)
        output_file_name = os.path.join(
            self.spacyDir,
            self.data_prefix + dataset_label + '-preprocessed.json')

        print('Preprocessing', dataset_label, 'file:', file_name)
        print('Loading json...')
        with open(file_name, 'r') as f:
            dataset = json.load(f)

        print('Processing json...')
        count = 0
        data = []
        tot = len(dataset['data'])
        type1 = type2 = 0
        for data_idx in tqdm(range(tot)):
            datum = dataset['data'][data_idx]['paragraphs'][0]
            context_str = datum['context']
            _datum = {
                'context': context_str,
                'title': dataset['data'][data_idx]['title'],
                'id': data_idx
            }

            nlp_context = nlp(pre_proc(context_str))
            _datum['annotated_context'] = self.process(nlp_context)
            _datum['raw_context_offsets'] = self.get_raw_context_offsets(
                _datum['annotated_context']['word'], context_str)
            _datum['qas'] = []

            # assert len(datum['qas']['questions']) == len(datum['answers'])

            for i in range(len(datum['qas'])):
                question = datum['qas'][i]['question']
                answer = datum['qas'][i]['answers'][0]['text']
                # assert question['turn_id'] == answer['turn_id']
                count += 1
                idx = datum['qas'][i]['id']
                _qas = {'turn_id': idx, 'question': question, 'answer': answer}

                _qas['annotated_question'] = self.process(
                    nlp(pre_proc(question)))

                _qas['annotated_answer'] = self.process(nlp(pre_proc(answer)))
                _qas['raw_answer'] = answer
                _qas['answer_span_start'] = datum['qas'][i]['answers'][0]['answer_start']
                # note: one character past the exclusive end (start + len(answer))
                _qas['answer_span_end'] = _qas['answer_span_start'] + len(answer) + 1
                _qas['followup'] = datum['qas'][i]['followup']
                _qas['yesno'] = datum['qas'][i]['yesno']

                tmp = _qas['raw_answer']
                tmp = self.removePunctuation(tmp)
                if _qas['raw_answer'] in context_str or tmp.lower() in [
                        "yes", "no", "unknown"
                ]:
                    type1 += 1
                    _qas['answer_type'] = "extractive"
                else:
                    type2 += 1
                    _qas['answer_type'] = "generative"

                start = _qas['answer_span_start']  # rationale span
                end = _qas['answer_span_end']
                chosen_text = _datum['context'][start:end].lower()
                # strip leading whitespace (\t, \n, etc.)
                while len(chosen_text) > 0 and chosen_text[0] in string.whitespace:
                    chosen_text = chosen_text[1:]
                    start += 1
                # strip trailing whitespace
                while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace:
                    chosen_text = chosen_text[:-1]
                    end -= 1
                input_text = _qas['answer'].strip().lower()
                if input_text in chosen_text:
                    p = chosen_text.find(input_text)  # p: start of input_text within chosen_text
                    _qas['answer_span'] = self.find_span(
                        _datum['raw_context_offsets'], start + p,
                        start + p + len(input_text))
                else:
                    _qas['answer_span'] = self.find_span_with_gt(
                        _datum['context'], _datum['raw_context_offsets'],
                        input_text)

                _datum['qas'].append(_qas)

            data.append(_datum)

        # build vocabulary
        if dataset_label == 'train':
            print('Build vocabulary from training data...')
            contexts = [_datum['annotated_context']['word'] for _datum in data]
            qas = [
                qa['annotated_question']['word'] +
                qa['annotated_answer']['word']
                for _datum in data for qa in _datum['qas']
            ]
            # self.train_vocab = self.build_vocab(contexts, qas)

        # print('Getting word ids...')
        # w2id = {w: i for i, w in enumerate(self.train_vocab)}
        # for _datum in data:
        #     _datum['annotated_context']['wordid'] = token2id_sent(_datum['annotated_context']['word'], w2id, unk_id=1,
        #                                                           to_lower=False)
        #     # new modify, get wordid
        #     for qa in _datum['qas']:
        #         qa['annotated_question']['wordid'] = token2id_sent(qa['annotated_question']['word'], w2id, unk_id=1,
        #                                                            to_lower=False)
        #         qa['annotated_answer']['wordid'] = token2id_sent(qa['annotated_answer']['word'], w2id, unk_id=1,
        #                                                          to_lower=False)

        # if dataset_label == 'train':
        #     # get the condensed dictionary embedding
        #     print('Getting embedding matrix for ' + dataset_label)
        #     embedding = build_embedding(self.glove_file, self.train_vocab, self.glove_dim)
        #     meta = {'vocab': self.train_vocab, 'embedding': embedding.tolist()}
        #     meta_file_name = os.path.join(self.spacyDir, dataset_label + '_meta.msgpack')
        #     print('Saving meta information to', meta_file_name)
        #     with open(meta_file_name, 'wb') as f:
        #         msgpack.dump(meta, f, encoding='utf8')

        dataset['data'] = data

        if dataset_label == 'test':
            return dataset

        with open(output_file_name, 'w') as output_file:
            json.dump(dataset, output_file, sort_keys=True, indent=4)
        print("The amount of extractive qa is: ", type1)
        print("The amount of generative qa is: ", type2)
        print("The amount of qas is: ", count)