def get_bert_word_data(self, ds_fp, data_type):
    dataset = pio.load(ds_fp)
    print('Extracting features with BERT')
    for qid, context, question, text, answer_start in tqdm(
            self.iter_cqa(dataset)):
        # if len(context) == 0 or len(question) == 0 or len(text) == 0:
        #     print('bug is here')
        #     break
        # Encode context, question and answer text in one request; with
        # show_tokens=True the client also returns the tokenized inputs.
        bert_encode_result = self.bc.encode([context, question, text],
                                            show_tokens=True)
        cembs = bert_encode_result[0][0]  # context embeddings
        qembs = bert_encode_result[0][1]  # question embeddings
        # qids = self.get_sent_word_ids(question, self.max_qlen)
        # Answer span: start offset plus the number of answer tokens.
        b, e = answer_start, answer_start + len(bert_encode_result[1][2])
        if e >= len(cembs[0]):
            b = e = 0
        yield qid, cembs, qembs, b, e
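# Note on the structure of bert_encode_result above: with bert-serving-client,
# BertClient.encode(texts, show_tokens=True) returns a pair (vectors, tokens),
# where vectors holds one embedding (or an embedding matrix, if the server runs
# with pooling_strategy=NONE) per input string and tokens holds the matching
# WordPiece token lists. Under that assumption:
#   bert_encode_result[0][0] -> embeddings of `context`
#   bert_encode_result[0][1] -> embeddings of `question`
#   bert_encode_result[1][2] -> token list of the answer `text`,
#                               whose length gives the answer span width.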
def build_wordset(self, f_path="data/vocab/words.txt"):
    if os.path.exists(f_path):
        self.wordset = load_pkl(f_path)
    else:
        for fp in self.datasets_fp:
            dataset = pio.load(fp)
            for _, context, question, answer, _ in self.iter_cqa(dataset):
                words = []
                words.extend(nltk.word_tokenize(context))
                words.extend(nltk.word_tokenize(question))
                words.extend(nltk.word_tokenize(answer))
                self.wordset |= set(words)
        self.wordset = [w for w in self.wordset if w not in stops]  # drop stop words
        self.wordset = sorted(self.wordset)  # sort alphabetically (quick-and-dirty ordering)
        self.wordset = ['[PAD]', '[CLS]', '[SEP]', '[UNK]'] + self.wordset
        dump_pkl(self.wordset, f_path)
    print(len(self.wordset))
    idx = list(range(len(self.wordset)))
    self.word2id = dict(zip(self.wordset, idx))
    self.id2word = dict(zip(idx, self.wordset))
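# get_sent_word_ids is called by get_word_data below but defined elsewhere in the
# repo. A minimal sketch of what such a lookup could look like, assuming the
# word2id mapping built above (with '[PAD]' at index 0 and '[UNK]' at index 3);
# the actual implementation may tokenize, truncate or pad differently.
def _sent_word_ids_sketch(self, sent, maxlen):
    tokens = nltk.word_tokenize(sent)[:maxlen]            # tokenize and truncate
    ids = [self.word2id.get(w, self.word2id['[UNK]'])     # map OOV words to [UNK]
           for w in tokens]
    ids += [self.word2id['[PAD]']] * (maxlen - len(ids))  # pad to fixed length
    return ids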
def get_word_data(self, ds_fp, data_type):
    dataset = pio.load(ds_fp)
    print('Extracting word-level data')
    return_list = []
    if 'word_data_{}.npy'.format(data_type) in os.listdir('./cache'):
        print('Cache found, loading word_data_{}..'.format(data_type))
        return_list = np.load('./cache/word_data_{}.npy'.format(data_type),
                              allow_pickle=True)
        return_list = return_list.tolist()
        print('Finished loading word_data_{}'.format(data_type))
    else:
        for qid, context, question, text, answer_start in tqdm(self.iter_cqa(dataset)):
            cids = self.get_sent_word_ids(context, self.max_clen)
            qids = self.get_sent_word_ids(question, self.max_qlen)
            b, e = answer_start, answer_start + len(word_tokenize(text))
            if e >= len(cids):
                b = e = 0
            # yield qid, cids, qids, b, e
            return_list.append((qid, cids, qids, b, e))
        tmp_word_list = np.array(return_list)
        np.save('./cache/word_data_{}.npy'.format(data_type), tmp_word_list)
    return return_list
def dataword_info(self, inn):
    contextlist = []
    questionlist = []
    answerlist = []
    wordslist = []
    dataset = pio.load(inn)
    for _, context, question, answer, _ in self.iter_cqa(dataset):
        # Strip punctuation (except apostrophes) via regex and keep the cleaned strings
        context = self.func_re(context)
        question = self.func_re(question)
        answer = self.func_re(answer)
        # Tokenize with nltk
        contextlist.append(WordPunctTokenizer().tokenize(context))
        questionlist.extend(WordPunctTokenizer().tokenize(question))
        answerlist.extend(WordPunctTokenizer().tokenize(answer))
    # Gather everything into wordslist
    wordslist.extend(contextlist)
    wordslist.extend(questionlist)
    wordslist.extend(answerlist)
    return contextlist, questionlist, answerlist, wordslist
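# func_re is used above but defined elsewhere in the repo. A minimal sketch of a
# regex cleaner matching the comment "strip punctuation except apostrophes";
# the real helper's name, pattern and return convention may differ.
def _func_re_sketch(self, s):
    import re
    # keep word characters, whitespace and apostrophes; replace everything else with a space
    return re.sub(r"[^\w\s']", ' ', s)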
def get_Bdata(self, ds_fp):
    dataset = pio.load(ds_fp)
    for qid, context, question, text, answer_start in self.iter_cqa(
            dataset):
        new_content = context + "|||" + question
        c_seg_list = self.seg_text(context)
        q_seg_list = self.seg_text(question)
        # Character-level answer span [b, e) over the raw context.
        b, e = answer_start, answer_start + len(text)
        # Map the character offsets onto 1-based word positions in the segmented context.
        nb = -1
        ne = -1
        len_all_char = 0
        for i, w in enumerate(c_seg_list):
            if i == 0:
                continue
            if len_all_char - 1 < b <= len_all_char + len(w) - 1:
                nb = i + 1
            if len_all_char - 1 < e <= len_all_char + len(w) - 1:
                ne = i + 1
            len_all_char += len(w)
        if ne == -1:  # answer end not found in the segmented context
            nb = ne = 0
        yield qid, context, question, new_content, nb, ne
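# A standalone sketch of the same char-offset-to-word-index mapping, written
# without the i == 0 skip so it can be tested in isolation; the function name
# and span convention here are illustrative only, not the repo's own helper.
def _char_span_to_word_span_sketch(segments, b, e):
    """Map character offsets [b, e) onto 1-based indices of the containing segments."""
    nb = ne = -1
    consumed = 0
    for i, w in enumerate(segments):
        if consumed <= b < consumed + len(w):
            nb = i + 1
        if consumed < e <= consumed + len(w):
            ne = i + 1
        consumed += len(w)
    return (nb, ne) if ne != -1 else (0, 0)
# Example: _char_span_to_word_span_sketch(['He', ' ', 'ran'], 3, 6) -> (3, 3)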
def get_data(self, ds_fp):
    # 08 Pull qid, context, question, text, answer_start out of the dataset
    dataset = pio.load(ds_fp)  # load the data via data_io (pio)