def analysis_0():
    """Print length statistics for the eqs_list / neqs_list sentences in the training data."""
    data_iter = get_train()
    id_num = 0
    eq_len = []
    neq_len = []
    avg_eq_len = []
    avg_neq_len = []
    for _, eqs_list, neqs_list in data_iter:
        id_num += 1
        avg_eq_len.append(len(eqs_list))
        avg_neq_len.append(len(neqs_list))
        for eq in eqs_list:
            eq_len.append(len(eq))
        for neq in neqs_list:
            neq_len.append(len(neq))
        print('\rnum: {}'.format(id_num), end=' ')  # progress counter
    print('\nover.')
    eq_len = np.array(eq_len)
    neq_len = np.array(neq_len)
    avg_eq_len = np.array(avg_eq_len)
    avg_neq_len = np.array(avg_neq_len)
    print('eq_len: ')
    print(eq_len.mean(), eq_len.max(), eq_len.min())
    print('neq_len: ')
    print(neq_len.mean(), neq_len.max(), neq_len.min())
    print('avg_eq_len: ')
    print(avg_eq_len.mean(), avg_eq_len.max(), avg_eq_len.min())
    print('avg_neq_len: ')
    print(avg_neq_len.mean(), avg_neq_len.max(), avg_neq_len.min())


def get_exist_words():
    """Segment the training sentences with jieba and dump frequent multi-character words to chars.dict."""
    import jieba
    from aa_cfg import join, DATA_PATH
    import json

    # Extend jieba with the task-specific user dictionaries.
    jieba.load_userdict(join(DATA_PATH, 'token_freq.txt'))
    jieba.load_userdict(join(DATA_PATH, 'law_word.txt'))

    chars = dict()
    train_iter = get_train()
    for _d in train_iter:
        _id, eqs_list, neqs_list = _d
        d_list = eqs_list + neqs_list  # concatenate instead of extending eqs_list in place
        for s in d_list:
            s_list = jieba.lcut(s)
            for w in s_list:
                chars[w] = chars.get(w, 0) + 1

    # Keep words that occur at least 10 times and are longer than one character,
    # sorted by descending frequency.
    chars = [(i, j) for i, j in chars.items() if j >= 10 and len(i) > 1]
    chars = sorted(chars, key=lambda c: -c[1])
    chars = [c[0] for c in chars]
    json.dump(chars, open(join(DATA_PATH, 'chars.dict'), 'w', encoding='utf-8'),
              indent=4, ensure_ascii=False)


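# A minimal sketch of how the dumped chars.dict could be consumed downstream, e.g. as a
# keyword whitelist when featurizing sentences. The loading path mirrors the dump above;
# the helper name load_exist_words is hypothetical and not part of this module.
def load_exist_words():
    import json
    from aa_cfg import join, DATA_PATH

    with open(join(DATA_PATH, 'chars.dict'), encoding='utf-8') as f:
        words = json.load(f)
    return set(words)

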
def get_line_text():
    """Yield every training sentence, truncated to ec_cfg.max_seq_len characters."""
    train_iter = get_train()
    for _d in train_iter:
        _id, eqs_list, neqs_list = _d
        d_list = eqs_list + neqs_list  # concatenate instead of extending eqs_list in place
        for s in d_list:
            yield s[:ec_cfg.max_seq_len]


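# A small usage sketch: materialize the truncated sentences into a plain-text corpus file
# (one sentence per line), e.g. as input for domain pretraining. The output filename
# corpus.txt is an assumption, not something this module defines.
def dump_line_text_corpus():
    out_path = join(MID_PATH, 'corpus.txt')  # hypothetical output location
    with open(out_path, 'w', encoding='utf-8') as f:
        for line in get_line_text():
            f.write(line + '\n')

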
def simplify_vocab_dict():
    """Build a reduced BERT vocabulary containing only characters seen in the training data."""
    import json

    chars = dict()
    min_count = 1
    model_pre_save_path = join(MODEL_PATH, 'train_pre')
    if not os.path.isdir(model_pre_save_path):
        os.makedirs(model_pre_save_path)

    # Count character frequencies over both the positive and negative sentence lists.
    data = get_train()
    for _, pos, neg in data:
        for sentence in pos:
            for w in sentence:
                chars[w] = chars.get(w, 0) + 1
        for sentence in neg:
            for w in sentence:
                chars[w] = chars.get(w, 0) + 1
    chars = [(i, j) for i, j in chars.items() if j >= min_count]
    chars = sorted(chars, key=lambda c: -c[1])
    chars = [c[0] for c in chars]
    json.dump(chars, open(join(model_pre_save_path, 'chars.dict'), 'w', encoding='utf-8'),
              indent=4, ensure_ascii=False)

    # checkpoint_path = os.path.join(main_path, 'model/bert/bert_model.ckpt')
    dict_path = os.path.join(DATA_PATH, 'bert_roberta/vocab.txt')
    _token_dict = load_vocab(dict_path)  # read the full BERT vocabulary
    token_dict, keep_words = {}, []  # keep_words holds the indices of the BERT vocab entries to keep

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])

    print('len of keep_words: ', len(keep_words))
    joblib.dump(token_dict, join(model_pre_save_path, 'token_dict.joblib'))
    joblib.dump(keep_words, join(model_pre_save_path, 'keep_words.joblib'))


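# A sketch of how token_dict / keep_words are typically consumed. It assumes the model is
# loaded with bert4keras (the actual loader lives elsewhere in this repo, so treat this as
# illustrative): keep_tokens shrinks the embedding matrix to the kept rows, and token_dict
# drives a matching tokenizer. config_path / checkpoint_path are placeholders.
def build_simplified_bert(config_path, checkpoint_path):
    from bert4keras.models import build_transformer_model
    from bert4keras.tokenizers import Tokenizer

    model_pre_save_path = join(MODEL_PATH, 'train_pre')
    token_dict = joblib.load(join(model_pre_save_path, 'token_dict.joblib'))
    keep_words = joblib.load(join(model_pre_save_path, 'keep_words.joblib'))

    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    model = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        keep_tokens=keep_words,  # only these rows of the original embedding are loaded
    )
    return tokenizer, model

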
def split_data(split_n=MAX_FOLD):
    """Shuffle the training data and split it into split_n cross-validation folds (5 in the original setup)."""
    print('read...')
    data = list(get_train())
    print('shuffle...')
    random.shuffle(data)
    val_len = int(len(data) / split_n)
    for i in range(split_n):
        if i == split_n - 1:
            # The last fold takes the remainder; the training set must stop at the fold
            # boundary, otherwise the tail examples would appear in both train and val.
            val = data[val_len * i:]
            train = data[:val_len * i]
        else:
            val = data[val_len * i:val_len * (i + 1)]
            train = data[:val_len * i] + data[val_len * (i + 1):]
        val_final = []
        for d in val:
            val_final.append(trains_pairs(d, 2))
        random.shuffle(val_final)
        print('save {}'.format(i))
        joblib.dump(train, join(MID_PATH, 'train_{}.joblib'.format(i)))
        joblib.dump(val_final, join(MID_PATH, 'val_{}.joblib'.format(i)))
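# A minimal sketch of reading one fold back for training and validation; fold index 0 is
# just an example and load_fold is a hypothetical helper. train_*.joblib keeps the raw
# (id, eqs_list, neqs_list) records, while val_*.joblib holds whatever trains_pairs
# produced above (presumably pre-built evaluation pairs).
def load_fold(fold=0):
    train = joblib.load(join(MID_PATH, 'train_{}.joblib'.format(fold)))
    val = joblib.load(join(MID_PATH, 'val_{}.joblib'.format(fold)))
    return train, val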