Example #1
    def get_bert_word_data(self, ds_fp, data_type):
        dataset = pio.load(ds_fp)
        print('Fetching features via BERT')

        for qid, context, question, text, answer_start in tqdm(
                self.iter_cqa(dataset)):
            # Encode context, question and answer text in one batch;
            # show_tokens=True additionally returns the tokenized inputs.
            bert_encode_result = self.bc.encode([context, question, text],
                                                show_tokens=True)
            cembs = bert_encode_result[0][0]  # context token embeddings
            qembs = bert_encode_result[0][1]  # question token embeddings
            # Span end = start offset + number of tokens in the answer text
            b, e = answer_start, answer_start + len(bert_encode_result[1][2])
            if e >= len(cembs):  # span falls outside the encoded context
                b = e = 0
            yield qid, cembs, qembs, b, e
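For reference, a minimal sketch of how the token-level encoding above behaves. It assumes a bert-serving-server instance is reachable on localhost and was started with pooling_strategy=NONE; the sample sentences are illustrative:

    from bert_serving.client import BertClient  # pip install bert-serving-client

    bc = BertClient()  # assumes a running bert-serving-server
    vecs, tokens = bc.encode(['how are you', 'fine thanks'], show_tokens=True)
    # With pooling_strategy=NONE, vecs has shape (2, max_seq_len, 768):
    # one 768-d vector per padded token position, and tokens[0] is the
    # first sentence's token list, e.g. ['[CLS]', 'how', 'are', 'you', '[SEP]']
    print(vecs.shape, tokens[0])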
Example #2
    def build_wordset(self, f_path="data/vocab/words.txt"):
        if os.path.exists(f_path):
            self.wordset = load_pkl(f_path)
        else:
            self.wordset = set()
            for fp in self.datasets_fp:
                dataset = pio.load(fp)
                for _, context, question, answer, _ in self.iter_cqa(dataset):
                    words = []
                    words.extend(nltk.word_tokenize(context))
                    words.extend(nltk.word_tokenize(question))
                    words.extend(nltk.word_tokenize(answer))
                    self.wordset |= set(words)
            self.wordset = [w for w in self.wordset if w not in stops]  # drop stopwords
            self.wordset = sorted(self.wordset)  # alphabetical sort for a deterministic order
            self.wordset = ['[PAD]', '[CLS]', '[SEP]', '[UNK]'] + self.wordset
            dump_pkl(self.wordset, f_path)
        print(len(self.wordset))
        idx = list(range(len(self.wordset)))
        self.word2id = dict(zip(self.wordset, idx))
        self.id2word = dict(zip(idx, self.wordset))
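A quick sanity check of the resulting mappings; this is hypothetical usage inside the same class, with an illustrative sentence:

    # Assumes build_wordset() has already populated word2id / id2word
    unk_id = self.word2id['[UNK]']
    ids = [self.word2id.get(w, unk_id) for w in nltk.word_tokenize('What is BERT?')]
    restored = [self.id2word[i] for i in ids]  # OOV words come back as '[UNK]'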
Example #3
    def get_word_data(self, ds_fp, data_type):
        dataset = pio.load(ds_fp)
        print('Fetching word-level data')
        return_list = []
        cache_fp = './cache/word_data_{}.npy'.format(data_type)
        if os.path.exists(cache_fp):
            print('Cache found, loading word_data_{}..'.format(data_type))
            return_list = np.load(cache_fp, allow_pickle=True).tolist()
            print('Finished loading word_data_{}'.format(data_type))
        else:
            for qid, context, question, text, answer_start in tqdm(self.iter_cqa(dataset)):
                cids = self.get_sent_word_ids(context, self.max_clen)
                qids = self.get_sent_word_ids(question, self.max_qlen)
                # Span end = start offset + number of answer tokens
                b, e = answer_start, answer_start + len(word_tokenize(text))
                if e >= len(cids):  # span exceeds the padded context length
                    b = e = 0
                return_list.append((qid, cids, qids, b, e))
            os.makedirs('./cache', exist_ok=True)
            np.save(cache_fp, np.array(return_list, dtype=object))

        return return_list
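get_sent_word_ids is not shown in this example; a minimal sketch of what such a helper could look like, assuming the word2id vocabulary from Example #2 (the padding scheme here is an assumption):

    from nltk import word_tokenize

    def get_sent_word_ids(self, sent, maxlen):
        # Map tokens to vocabulary ids, falling back to [UNK],
        # then truncate or pad with [PAD] to a fixed length
        unk_id = self.word2id['[UNK]']
        ids = [self.word2id.get(w, unk_id) for w in word_tokenize(sent)]
        ids = ids[:maxlen]
        ids += [self.word2id['[PAD]']] * (maxlen - len(ids))
        return ids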
Example #4
    def dataword_info(self, inn):
        contextlist = []
        questionlist = []
        answerlist = []
        wordslist = []
        dataset = pio.load(inn)

        for _, context, question, answer, _ in self.iter_cqa(dataset):
            # Strip punctuation (apostrophes excepted) via regex;
            # strings are immutable, so the results must be reassigned
            context = self.func_re(context)
            question = self.func_re(question)
            answer = self.func_re(answer)
            # Tokenize with nltk's WordPunctTokenizer
            c_tokens = WordPunctTokenizer().tokenize(context)
            q_tokens = WordPunctTokenizer().tokenize(question)
            a_tokens = WordPunctTokenizer().tokenize(answer)
            contextlist.append(c_tokens)
            questionlist.extend(q_tokens)
            answerlist.extend(a_tokens)

            # Collect the current example's tokens into wordslist
            wordslist.extend(c_tokens)
            wordslist.extend(q_tokens)
            wordslist.extend(a_tokens)

        return contextlist, questionlist, answerlist, wordslist
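func_re is referenced but not defined in this example; a plausible minimal implementation, assuming it replaces all punctuation except apostrophes with spaces (the exact pattern is an assumption):

    import re

    def func_re(self, text):
        # Keep word characters, whitespace and apostrophes; drop the rest
        return re.sub(r"[^\w\s']", ' ', text)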
Example #5
    def get_Bdata(self, ds_fp):
        dataset = pio.load(ds_fp)

        for qid, context, question, text, answer_start in self.iter_cqa(
                dataset):
            new_content = context + "|||" + question
            c_seg_list = self.seg_text(context)
            q_seg_list = self.seg_text(question)
            # Character-level answer span [b, e) within the context
            b, e = answer_start, answer_start + len(text)
            # Map the character span onto word positions in c_seg_list;
            # the +1 leaves position 0 free, e.g. for a leading special token
            nb = -1
            ne = -1
            len_all_char = 0
            for i, w in enumerate(c_seg_list):
                if nb == -1 and len_all_char <= b < len_all_char + len(w):
                    nb = i + 1
                if ne == -1 and len_all_char < e <= len_all_char + len(w):
                    ne = i + 1
                len_all_char += len(w)

            if nb == -1 or ne == -1:
                nb = ne = 0
            yield qid, context, question, new_content, nb, ne
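The character-to-word span mapping above is easy to sanity-check in isolation; a small self-contained sketch (the function name and sample data are made up, and indices here are 0-based without the +1 offset):

    def char_span_to_word_span(words, b, e):
        # words: segmented context; [b, e) is a character span over ''.join(words)
        nb = ne = -1
        offset = 0
        for i, w in enumerate(words):
            if nb == -1 and offset <= b < offset + len(w):
                nb = i
            if ne == -1 and offset < e <= offset + len(w):
                ne = i
            offset += len(w)
        return nb, ne

    words = ['深度', '学习', '模型']
    print(char_span_to_word_span(words, 2, 4))  # -> (1, 1): the span covers '学习'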
Example #6
    def get_data(self, ds_fp):
        # 08: pull qid, context, question, text, answer_start out of the dataset
        dataset = pio.load(ds_fp)   # load the data via data_io