def label_mp(split):
    """ process the data split with multi-processing"""
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    with mp.Pool() as pool:
        list(pool.imap_unordered(process(split),
                                 list(range(n_data)), chunksize=1024))
    print('finished in {}'.format(timedelta(seconds=time()-start)))
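# count_data() is imported from the project's utilities and is not shown here.
# A minimal sketch, assuming each split directory holds files named
# 0.json, 1.json, ... (the filename pattern and helper name are assumptions):
import os
import re

def count_data_sketch(path):
    """Count the numbered .json files in a split directory."""
    matcher = re.compile(r'^[0-9]+\.json$')
    return sum(1 for name in os.listdir(path) if matcher.match(name))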
def dump(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    dump_dir = join(DATA_DIR, 'refs', split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        # str.format does not use %-escaping, so a single % prints correctly
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        abs_sents = data['abstract']
        with open(join(dump_dir, '{}.ref'.format(i)), 'w') as f:
            f.write(make_html_safe('\n'.join(abs_sents)))
    print('finished in {}'.format(timedelta(seconds=time()-start)))
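# make_html_safe() is defined elsewhere in the project; a minimal sketch,
# assuming it only needs to escape the angle brackets that would break the
# HTML reports produced by the ROUGE tooling:
def make_html_safe_sketch(s):
    """Escape < and > so reference files are safe to embed in HTML."""
    return s.replace('<', '&lt;').replace('>', '&gt;')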
def label(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        tokenize = compose(list, _split_words)
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time()-start)))
def label(split):
    # UTF-8-aware variant of label() above; if both are kept in one module,
    # this definition overrides the previous one.
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data, 100 * i / n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i)), encoding='utf-8') as f:
            # json.loads() no longer accepts an encoding argument in Python 3;
            # decoding is handled by open(..., encoding='utf-8')
            data = json.loads(f.read())
        tokenize = compose(list, _split_words)
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time() - start)))
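# get_extract_label() is defined elsewhere in the project; the sketch below
# shows the usual greedy proxy-labelling scheme such a helper implements: for
# each abstract sentence, pick the not-yet-chosen article sentence with the
# highest overlap score. The LCS-based recall used here is an illustrative
# stand-in, not necessarily the project's exact similarity metric.
def _lcs_len(a, b):
    """Length of the longest common subsequence of two token lists."""
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            dp[i][j] = dp[i-1][j-1] + 1 if x == y else max(dp[i-1][j], dp[i][j-1])
    return dp[-1][-1]

def get_extract_label_sketch(art_sents, abs_sents):
    """Greedily match each abstract sentence to one article sentence."""
    extracted, scores = [], []
    for abst in abs_sents:
        if not abst:
            continue
        best_i, best_s = -1, 0.0
        for i, art in enumerate(art_sents):
            if i in extracted:
                continue
            s = _lcs_len(art, abst) / len(abst)
            if s > best_s:
                best_i, best_s = i, s
        if best_i < 0:
            break
        extracted.append(best_i)
        scores.append(best_s)
    return extracted, scores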
def __init__(self):
    self._path = join(DATA_DIR, 'train')
    self._n_data = count_data(self._path)
def main(args):
    print('no use bert')
    os.makedirs(AFTER_DIR)
    # BERT matcher loading is disabled in this BM25-only variant;
    # see the BERT-based main() below for the full pipeline.
    stopwords = stopwordlist()
    remove = lambda token: token not in stopwords
    context_path = 'data/class/context'
    context_data = count_data(context_path)
    corpus = []
    new_docid_arr = []
    # build a tokenized corpus of all context documents for BM25 retrieval
    for i in range(context_data):
        with open(join('data/class/context', '{}.json'.format(i + 1))) as f:
            js_data = json.load(f)
        text = filter_text(js_data['text'].replace(' ', '')
                           .replace('&rbsp;', '').replace('&mbsp;', ''))
        new_docid = js_data['new_docid']
        data = list(jieba.lcut(filter_text(text), cut_all=False, HMM=True))
        data = list(filter(remove, data))
        print(new_docid)
        corpus.append(data)
        new_docid_arr.append(new_docid)
    dictionary = corpora.Dictionary(corpus)  # built but not used below
    bm25Model = bm25.BM25(corpus)
    with torch.no_grad():
        for index in range(1643):  # hard-coded number of test samples
            with open(join(join('data/final', 'original_test_sample'),
                           '{}.json'.format(index + 1))) as f:
                js_data = json.load(f)
            print('loading: {}'.format(index + 1))
            id, question_text, ques_id = (js_data['id'], js_data['question'],
                                          js_data['question_id'])
            q_data = list(jieba.lcut(filter_text(question_text),
                                     cut_all=False, HMM=True))
            q_data = list(filter(remove, q_data))
            # first pass: top-10 candidates over the whole corpus
            scores = bm25Model.get_scores(q_data)
            max_num_index_list = list(map(scores.index, heapq.nlargest(10, scores)))
            arr = [new_docid_arr[idx] for idx in max_num_index_list]
            # second pass: re-score only the candidates with a fresh BM25 model
            new_corpus = []
            new_new_docid_arr = []
            for con in arr:
                with open(join(join(DATASET_DIR, 'context'),
                               '{}.json'.format(con))) as c:
                    cn_data = json.load(c)
                co_docid, docid, text = (cn_data['new_docid'], cn_data['docid'],
                                         cn_data['text'])
                data = list(jieba.lcut(filter_text(text), cut_all=False, HMM=True))
                data = list(filter(remove, data))
                new_corpus.append(data)
                new_new_docid_arr.append(co_docid)
            new_bm25Model = bm25.BM25(new_corpus)
            new_scores = new_bm25Model.get_scores(q_data)
            best = new_scores.index(heapq.nlargest(1, new_scores)[0])
            final_docid = new_new_docid_arr[best]
            with open(join('data/class/context', '{}.json'.format(final_docid))) as l:
                cn_data = json.load(l)
            f_new_docid, f_docid, f_text = (cn_data['new_docid'], cn_data['docid'],
                                            cn_data['text'])
            # BERT re-ranking of the retrieved passage is disabled in this
            # variant; the BERT-based main() below keeps that logic.
            tmp_dict = {}
            tmp_dict['index'] = index + 1
            tmp_dict['id'] = id
            tmp_dict['question'] = question_text
            tmp_dict['new_docid'] = final_docid
            tmp_dict['docid'] = f_docid
            tmp_dict['text'] = f_text
            with open(join(AFTER_DIR, '{}.json'.format(index + 1)), 'w',
                      encoding='utf-8') as p:
                json.dump(tmp_dict, p, ensure_ascii=False)
            print('finish processing {}'.format(index + 1))
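# stopwordlist() and filter_text() are project helpers not shown in this file.
# Minimal sketches of what main() assumes they do; the stopword path and the
# regular expressions are illustrative assumptions, not the project's code:
import re

def stopwordlist_sketch(path='data/stopwords.txt'):
    """Load one stopword per line into a set."""
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}

def filter_text_sketch(text):
    """Strip HTML tag remnants and control characters before tokenization."""
    text = re.sub(r'<[^>]+>', '', text)
    return re.sub(r'[\x00-\x1f\x7f]', '', text)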
def main(args):
    print('./MRC_pretrain')
    os.makedirs(AFTER_DIR)
    # load the trained BertMatcher checkpoint used to re-rank BM25 candidates
    meta = json.load(open(join(DATA_DIR, 'meta.json')))
    nargs = meta['net_args']
    ckpt = load_best_ckpt(DATA_DIR)
    net = BertMatcher(**nargs)
    net.load_state_dict(ckpt)
    if args.cuda:
        net = net.cuda()
    net.eval()
    tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain')
    stopwords = stopwordlist()
    remove = lambda token: token not in stopwords
    context_path = 'data/class/context'
    context_data = count_data(context_path)
    corpus = []
    new_docid_arr = []
    # build a tokenized corpus of all context documents for BM25 retrieval
    for i in range(context_data):
        with open(join('data/class/context', '{}.json'.format(i + 1))) as f:
            js_data = json.load(f)
        text = filter_text(js_data['text'].replace(' ', '')
                           .replace('&rbsp;', '').replace('&mbsp;', ''))
        new_docid = js_data['new_docid']
        data = list(jieba.lcut(filter_text(text), cut_all=False, HMM=True))
        data = list(filter(remove, data))
        print(new_docid)
        corpus.append(data)
        new_docid_arr.append(new_docid)
    dictionary = corpora.Dictionary(corpus)  # built but not used below
    bm25Model = bm25.BM25(corpus)
    with torch.no_grad():
        for index in range(1643):  # hard-coded number of test samples
            with open(join(join('data/final', 'original_test_sample'),
                           '{}.json'.format(index + 1))) as f:
                js_data = json.load(f)
            print('loading: {}'.format(index + 1))
            id, question_text, ques_id = (js_data['id'], js_data['question'],
                                          js_data['question_id'])
            q_data = list(jieba.lcut(filter_text(question_text),
                                     cut_all=False, HMM=True))
            q_data = list(filter(remove, q_data))
            # BM25 pass: top-5 candidate documents
            scores = bm25Model.get_scores(q_data)
            max_num_index_list = list(map(scores.index, heapq.nlargest(5, scores)))
            arr = [new_docid_arr[m] for m in max_num_index_list]
            highest_score = []
            context_new_id = []
            context_id = []
            context_content = []
            # BERT pass: score every candidate (or every 412-token window of it)
            for con in arr:
                with open(join(join(DATASET_DIR, 'context'),
                               '{}.json'.format(con))) as c:
                    cn_data = json.load(c)
                new_docid, docid, text = (cn_data['new_docid'], cn_data['docid'],
                                          cn_data['text'])
                text_tok = tokenizer.tokenize(text)
                text_id = tokenizer.convert_tokens_to_ids(text_tok)
                text_len = len(text_id)
                question_len = len(ques_id)
                if question_len + text_len <= 512:
                    concat_text = ques_id + text_id
                    token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize(
                        [concat_text], args.cuda)
                    net_out = net(token_tensor, segment_tensor, mask_tensor)
                    # the threshold check against highest_score[-1] is
                    # disabled: every candidate is kept
                    highest_score.append(net_out[0][0].item())
                    context_new_id.append(new_docid)
                    context_id.append(docid)
                    context_content.append(text)
                else:
                    # slide a 412-token window with a 312-token stride so that
                    # question + window stays within BERT's 512-token limit
                    sp = 0
                    ep = 412
                    while True:
                        if ep >= text_len and sp < text_len:
                            sub_text = text_id[sp:text_len]
                            sub_tok = text_tok[sp:text_len]
                        elif ep > text_len:
                            break
                        else:
                            sub_text = text_id[sp:ep]
                            sub_tok = text_tok[sp:ep]
                        concat_text = ques_id + sub_text
                        token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize(
                            [concat_text], args.cuda)
                        net_out = net(token_tensor, segment_tensor, mask_tensor)
                        # rebuild readable text for this window from its tokens
                        output = ''.join(tok for tok in sub_tok if tok != '[UNK]')
                        output = output.replace('##', '')
                        print(output)
                        highest_score.append(net_out[0][0].item())
                        context_new_id.append(new_docid)
                        context_id.append(docid)
                        context_content.append(output)
                        sp += 312
                        ep += 312
            # concatenate the five best-scoring passages into the final context
            ranking_index = list(map(highest_score.index,
                                     heapq.nlargest(5, highest_score)))
            fi = ''
            for cnm in ranking_index:
                fi += context_content[cnm]
            tmp_dict = {}
            tmp_dict['index'] = index + 1
            tmp_dict['id'] = id
            tmp_dict['question'] = question_text
            # note: ids come from the first candidate, not the top-ranked one
            tmp_dict['new_docid'] = context_new_id[0]
            tmp_dict['docid'] = context_id[0]
            tmp_dict['text'] = fi
            with open(join(AFTER_DIR, '{}.json'.format(index + 1)), 'w',
                      encoding='utf-8') as f:
                json.dump(tmp_dict, f, ensure_ascii=False)
            print('finish processing {}'.format(index + 1))
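# pad_batch_tensorize() is a project helper not shown in this file. A minimal
# sketch of what main() assumes it does: pad a batch of token-id lists to a
# common length and return (token, segment, mask) tensors. The padding value
# and the all-zero segment ids are assumptions, not the project's actual code.
import torch

def pad_batch_tensorize_sketch(batch, cuda=True, pad=0):
    """Pad a list of id lists into (token, segment, mask) LongTensors."""
    max_len = max(len(ids) for ids in batch)
    token = torch.full((len(batch), max_len), pad, dtype=torch.long)
    mask = torch.zeros((len(batch), max_len), dtype=torch.long)
    for i, ids in enumerate(batch):
        token[i, :len(ids)] = torch.tensor(ids, dtype=torch.long)
        mask[i, :len(ids)] = 1
    segment = torch.zeros_like(token)  # single-segment assumption
    if cuda:
        token, segment, mask = token.cuda(), segment.cuda(), mask.cuda()
    return token, segment, mask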
def process_positive_example_v1():
    stopwords = stopwordlist()
    remove = lambda token: token not in stopwords
    os.makedirs('data/class/pos')
    os.makedirs('data/class/neg')
    context_path = 'data/class/context'
    context_data = count_data(context_path)
    corpus = []
    new_docid_arr = []
    # build a tokenized corpus of all context documents for BM25 retrieval
    for i in range(context_data):
        with open(join('data/class/context', '{}.json'.format(i + 1))) as f:
            js_data = json.load(f)
        text = filter_text(js_data['text'].replace(' ', '')
                           .replace('&rbsp;', '').replace('&mbsp;', ''))
        new_docid = js_data['new_docid']
        data = list(jieba.lcut(filter_text(text), cut_all=False, HMM=True))
        data = list(filter(remove, data))
        print(new_docid)
        corpus.append(data)
        new_docid_arr.append(new_docid)
    dictionary = corpora.Dictionary(corpus)  # built but not used below
    bm25Model = bm25.BM25(corpus)
    csv_reader = csv.reader(open(TRAIN_DIR), delimiter='\t')
    rows = [row for row in csv_reader]
    docid_name = rows[0][1]
    question_name = rows[0][2]
    answer_name = rows[0][3]
    json_positive_dirs = join(CLASSIFICATION_DIR, 'positive_sample')
    with open(REALATE_DIR, 'rb') as v:
        relation_dict = pickle.load(v)
    # drop a few malformed rows from the training csv
    sample_rows = rows[:1582] + rows[1583:1955] + rows[1956:3781] + rows[3782:]
    maxlen = 0
    count = 1   # positive-sample counter
    ncount = 1  # negative-sample counter
    right = 0   # questions whose gold document survives BM25 top-3
    tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain')
    for i, sample_raw in enumerate(sample_rows):
        print('loading {}'.format(i))
        if i == 0:  # header row
            continue
        new_docid = relation_dict[sample_raw[1]]
        question = filter_text(sample_raw[2].replace(' ', ''))
        q_data = list(jieba.lcut(filter_text(question), cut_all=False, HMM=True))
        q_data = list(filter(remove, q_data))
        scores = bm25Model.get_scores(q_data)
        max_num_index_list = list(map(scores.index, heapq.nlargest(3, scores)))
        arr = [new_docid_arr[m] for m in max_num_index_list]
        # keep the sample only if the gold document is among the BM25 top-3
        if new_docid not in arr:
            continue
        right += 1
        for con in arr:
            with open(join('data/class/context', '{}.json'.format(con))) as c:
                cn_data = json.load(c)
            cont_docid, docid, text = (cn_data['new_docid'], cn_data['docid'],
                                       cn_data['text'])
            ques_tok = tokenizer.tokenize("[CLS] " + question + " [SEP]")
            ques_id = tokenizer.convert_tokens_to_ids(ques_tok)
            question_len = len(ques_id)
            text_tok = tokenizer.tokenize(text)
            text_id = tokenizer.convert_tokens_to_ids(text_tok)
            text_len = len(text_id)
            if con == new_docid:
                # positive example: verify the answer span occurs in the text
                tmp_dict = {}
                tmp_dict['is_related'] = 1
                tmp_dict['new_docid'] = new_docid
                answer = filter_text(sample_raw[3].replace(' ', ''))
                ans_tok = tokenizer.tokenize(answer)
                ans_id = tokenizer.convert_tokens_to_ids(ans_tok)
                ans_len = len(ans_id)
                suppose_start = []  # candidate start positions of the answer span
                for j in range(text_len):
                    if text_id[j] == ans_id[0]:
                        suppose_start.append(j)
                s = 0
                e = 0
                if len(suppose_start) <= 0:
                    continue
                for t in range(len(suppose_start)):
                    start = suppose_start[t]
                    end = suppose_start[t]
                    for m in range(ans_len):
                        if m + start >= text_len:
                            break
                        elif ans_id[m] == text_id[m + start]:
                            end += 1
                        else:
                            break
                    if end - start != ans_len:
                        continue
                    s = suppose_start[t]
                    e = end
                    break
                if s == 0 and e == 0:
                    continue
                span_arr = [0] * s + [1] * (e - s) + [0] * (text_len - e)  # computed but not saved
                if question_len + text_len <= 512:
                    tmp_dict['question'] = ques_id
                    tmp_dict['text'] = text_id
                    with open(join('data/class/pos', '{}.json'.format(count)),
                              'w', encoding='utf-8') as f:
                        json.dump(tmp_dict, f, ensure_ascii=False)
                    count += 1
                else:
                    # slide a 412-token window with a 312-token stride
                    sp = 0
                    ep = 412
                    assert question_len <= 100 and text_len >= 412
                    while True:
                        if ep >= text_len and sp < text_len:
                            sub_text = text_id[sp:text_len]
                            assert question_len + text_len - sp <= 512
                        elif ep > text_len:
                            break
                        else:
                            sub_text = text_id[sp:ep]
                            assert question_len + ep - sp <= 512
                        tmp_dict['question'] = ques_id
                        tmp_dict['text'] = sub_text
                        with open(join('data/class/pos', '{}.json'.format(count)),
                                  'w', encoding='utf-8') as f:
                            json.dump(tmp_dict, f, ensure_ascii=False)
                        count += 1
                        sp += 312
                        ep += 312
            else:
                # negative example: a BM25 hit that is not the gold document
                tmp_dict = {}
                tmp_dict['is_related'] = 0
                tmp_dict['new_docid'] = con
                tmp_dict['question'] = ques_id
                if question_len + text_len <= 512:
                    tmp_dict['text'] = text_id
                    with open(join('data/class/neg', '{}.json'.format(ncount)),
                              'w', encoding='utf-8') as f:
                        json.dump(tmp_dict, f, ensure_ascii=False)
                    ncount += 1
                else:
                    sp = 0
                    ep = 412
                    assert question_len <= 100 and text_len >= 412
                    while True:
                        if ep >= text_len and sp < text_len:
                            sub_text = text_id[sp:text_len]
                            assert question_len + text_len - sp <= 512
                        elif ep > text_len:
                            break
                        else:
                            sub_text = text_id[sp:ep]
                            assert question_len + ep - sp <= 512
                        tmp_dict['text'] = sub_text
                        with open(join('data/class/neg', '{}.json'.format(ncount)),
                                  'w', encoding='utf-8') as f:
                            json.dump(tmp_dict, f, ensure_ascii=False)
                        ncount += 1
                        sp += 312
                        ep += 312
    print('Pre-processed {} positive samples finished'.format(right))
    print(len(sample_rows))
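# The 412/312 sliding-window logic above is repeated in several places; a
# simplified standalone helper with the same idea (the name is illustrative,
# and it does not reproduce the extra overlapping tail window the loops above
# can emit in some edge cases):
def window_chunks(ids, window=412, stride=312):
    """Yield overlapping id windows so question + window stays within 512."""
    sp = 0
    while sp < len(ids):
        yield ids[sp:sp + window]
        if sp + window >= len(ids):
            break
        sp += stride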
def __init__(self, data_dir):
    self._path = os.path.join(data_dir, 'train')
    self._n_data = count_data(self._path)
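# This __init__ appears to belong to a dataset over the numbered training
# json files; a minimal sketch of the rest of such a class, assuming a
# torch-style Dataset (the class name and __getitem__ payload are assumptions):
import json
import os
from torch.utils.data import Dataset

class JsonTrainDataset(Dataset):
    def __init__(self, data_dir):
        self._path = os.path.join(data_dir, 'train')
        self._n_data = count_data(self._path)

    def __len__(self):
        return self._n_data

    def __getitem__(self, i):
        with open(os.path.join(self._path, '{}.json'.format(i))) as f:
            return json.load(f)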