def get_the_final_data():
    """Build the final dataset, formatted as [q11, q21, q12, q22, label].

    Augments the positive training samples via synonym / same-pinyin
    replacement, derives pattern data, splits into train/dev/test and writes
    every split under ./final_data/final_data_2/.  Finally converts the raw
    dev CSV into the 4-column text format.

    :return: None (all results are written to disk)
    """
    save_data_dir = "./final_data/final_data_2/"
    # os.mkdir fails when the parent "./final_data" does not exist yet, and
    # the exists()/mkdir pair is race-prone; makedirs(exist_ok=True) is safe.
    os.makedirs(save_data_dir, exist_ok=True)
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"
    data, true_data, false_data = read_data(file_path_json)
    # Data augmentation: replace synonyms and same-pinyin characters.
    data_eva = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin, cilinpath,
                                           same_pinyin_file, chinese_word_freq_file)
    new_data = data_eva + data  # dataset including the augmented samples
    final_data, pattern_data = get_data_pattern(new_data)
    all_data, train, dev, test = data_inverse(final_data)
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    save_data(all_data, all_data_path_txt)
    save_data(train, train_path_txt)
    save_data(test, test_path_txt)
    save_data(dev, dev_path_txt)
    # Convert the raw dev CSV into the text test set.
    dev_csv_path = "./dataset/dev_set.csv"
    dev_txt_path = save_data_dir + "dev_set.txt"
    dev = read_data(dev_csv_path, dev=True)
    dev_data, pattern_dev = get_data_pattern(dev, dev=True)
    save_data(dev_data, dev_txt_path, columns_num=4)
def compare_the_same_q_in_train_and_dev(train_path, dev_path, stopword_path, otherword_path):
    """Count how many dev-set questions also appear in the training set.

    :param train_path: path of the training set
    :param dev_path: path of the dev/test set
    :param stopword_path: path of the stopword list
    :param otherword_path: path of extra words to strip (filler phrases such
        as "may I ask", "a quick question", ...)
    :return: number of dev questions that also occur in the training set
    """
    stopwords = stopwordList(stopword_path)
    otherword = stopwordList(otherword_path)
    # stopwords += otherword
    train_data, train_true_data, train_false_data = read_data(train_path)
    # Keep only the two question texts; drop the label column.
    train_data = [[sample[0], sample[1]] for sample in train_data]
    train_sample = spread_list(train_data)
    train_question = []
    for i in tqdm(range(len(train_sample))):
        train_question.append(seg_depart(train_sample[i], stopwords, otherword))
    dev_data = read_data(dev_path, dev=True)
    dev_sample = spread_list(dev_data)
    dev_question = []
    for i in tqdm(range(len(dev_sample))):
        dev_question.append(seg_depart(dev_sample[i], stopwords, otherword))
    # Membership in a set is O(1); the original list scan was O(len(train))
    # for every single dev question.  seg_depart yields text (it is saved as
    # a plain text column elsewhere in this file), hence hashable.
    train_question_set = set(train_question)
    same_number = 0  # number of dev questions also present in training
    for question in tqdm(dev_question):
        if question in train_question_set:
            same_number += 1
    return same_number
def get_the_final_data_4(dev_samples=-5000):
    """Build the final dataset, formatted as [q11, q21, q12, q22, q31, label],
    carving the last 5000 samples off the original training set as a held-out
    split.  Augmentation raised to 10000 samples.

    q12: words of q1 that do not appear in q2
    q22: words of q2 that do not appear in q1
    q31: words shared by q1 and q2

    :param dev_samples: negative slice index marking where the held-out
        split starts (default: last 5000 samples)
    :return: None (all results are written to disk)
    """
    save_data_dir = "./final_data/final_data_6/"
    # os.mkdir fails when the parent "./final_data" is missing; makedirs with
    # exist_ok=True also avoids the exists()/mkdir race.
    os.makedirs(save_data_dir, exist_ok=True)
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"
    data, true_data, false_data = read_data(file_path_json)
    # Data augmentation: replace synonyms and same-pinyin characters.
    data_eva = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin, cilinpath,
                                           same_pinyin_file, chinese_word_freq_file, portition=0.1)
    new_data = data_eva + data  # dataset including the augmented samples
    print(len(new_data))
    all_train_data, all_train_data_pattern = get_data_pattern(new_data, mode=2)
    all_train_data, all_train_train, all_train_dev, all_train_test = data_inverse(all_train_data, mode=2)
    print(len(all_train_data))
    dev_data_from_train = new_data[dev_samples:]  # tail of the data becomes the validation split
    new_data = new_data[0:dev_samples]  # the remainder is the training split
    final_data, pattern_data = get_data_pattern(new_data, mode=2)
    all_data, train, dev, test = data_inverse(final_data, mode=2)
    dev_data_from_train_1, dev_data_from_train_pattern = get_data_pattern(dev_data_from_train, mode=2)
    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data, all_train_data_path, columns_num=6)
    save_data(all_data, all_data_path_txt, columns_num=6)
    save_data(train, train_path_txt, columns_num=6)
    save_data(test, test_path_txt, columns_num=6)
    save_data(dev, dev_path_txt, columns_num=6)
    save_data(dev_data_from_train_1, dev_from_train_path_txt, columns_num=6)
    # Convert the raw dev CSV into the text test set.
    dev_csv_path = "./dataset/dev_set.csv"
    dev_txt_path = save_data_dir + "dev_set.txt"
    dev = read_data(dev_csv_path, dev=True)
    dev_data, pattern_dev = get_data_pattern(dev, dev=True, mode=2)
    save_data(dev_data, dev_txt_path, columns_num=5)
def get_same_sample_in_train_and_test():
    """Count the samples shared by the test set and the training set.

    Loads both datasets, strips stopwords/filler words, and delegates the
    comparison to compare_same_sample_in_train_and_test, which also writes
    the matches to ./tongji/same_sample_in_train_and_test.csv.

    :return: the result of compare_same_sample_in_train_and_test
    """
    train_path = "./train_set.json"
    dev_path = "./test_set.csv"
    stopword_path = "./stopwords.txt"
    otherword_path = "./otherwords.txt"
    save_data_dir = "./tongji/"
    # makedirs(exist_ok=True) avoids both the missing-parent failure of
    # os.mkdir and the exists()/mkdir race.
    os.makedirs(save_data_dir, exist_ok=True)
    save_same_sample_path = "./tongji/same_sample_in_train_and_test.csv"
    stopwords = stopwordList(stopword_path)
    otherword = stopwordList(otherword_path)
    # stopwords += otherword
    train_data, train_true_data, train_false_data = read_data(train_path)
    dev_data = read_data(dev_path, dev=True)
    same_sample = compare_same_sample_in_train_and_test(train_data, dev_data, save_same_sample_path,
                                                        stopwords, otherword)
    # The original computed this value and silently dropped it; return it so
    # callers can use the count without re-reading the CSV.
    return same_sample
def get_the_final_data_5(dev_samples=-5000):
    """Build the final dataset, formatted as [q1, q2, label], carving the last
    5000 samples off the original training set as a held-out split.

    Note: the output contains ONLY the augmented data, not the original data.

    :param dev_samples: negative slice index marking where the held-out
        split starts (default: last 5000 samples)
    :return: None (all results are written to disk)
    """
    save_data_dir = "./final_data/final_data_9/"
    # os.mkdir fails when the parent "./final_data" is missing; makedirs with
    # exist_ok=True also avoids the exists()/mkdir race.
    os.makedirs(save_data_dir, exist_ok=True)
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"
    data, true_data, false_data = read_data(file_path_json)
    # Data augmentation: replace synonyms and same-pinyin characters.
    data_eva = synword_and_samepinyin_data(data, save_data_synwords_and_samepinyin, cilinpath,
                                           same_pinyin_file, chinese_word_freq_file, portition=1)
    # new_data = data_eva + data  # would include the original data as well
    new_data = data_eva
    all_train_data, all_train_data_1, all_train_data_2, all_train_dat_3 = data_inverse(new_data, pattern=False)
    dev_data_from_train = new_data[dev_samples:]  # tail of the data becomes the validation split
    new_data = new_data[0:dev_samples]  # the remainder is the training split
    all_data, train, dev, test = data_inverse(new_data, pattern=False)
    # dev_data_from_train_1, dev_data_from_train_pattern = get_data_pattern(dev_data_from_train)
    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data, all_train_data_path, columns_num=3)
    save_data(all_data, all_data_path_txt, columns_num=3)
    save_data(train, train_path_txt, columns_num=3)
    save_data(test, test_path_txt, columns_num=3)
    save_data(dev, dev_path_txt, columns_num=3)
    save_data(dev_data_from_train, dev_from_train_path_txt, columns_num=3)
    # Convert (and augment) the raw test CSV into the text test set.
    dev_csv_path = "./dataset/test_set.csv"
    dev_txt_path = save_data_dir + "test_set.txt"
    dev = read_data(dev_csv_path, dev=True)
    save_data_synwords_and_samepinyin_for_dev = save_data_dir + "data_replace_by_synwords_and_samepinyin_for_dev.txt"
    data_eva = synword_and_samepinyin_data(dev, save_data_synwords_and_samepinyin_for_dev, cilinpath,
                                           same_pinyin_file, chinese_word_freq_file,
                                           columns_num=2, portition=1)
    # dev_data, pattern_dev = get_data_pattern(dev,dev=True)
    save_data(dev, dev_txt_path, columns_num=2)
def get_the_final_data_3(dev_samples=-5000):
    """Build the final dataset, formatted as [q11, q21, q12, q22, label],
    carving the last 5000 samples off the original training set as a held-out
    split.  The pattern step replaces shared words with wildcards.

    :param dev_samples: negative slice index marking where the held-out
        split starts (default: last 5000 samples)
    :return: None (all results are written to disk)
    """
    save_data_dir = "./final_data/final_data_10/"
    # os.mkdir fails when the parent "./final_data" is missing; makedirs with
    # exist_ok=True also avoids the exists()/mkdir race.
    os.makedirs(save_data_dir, exist_ok=True)
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"
    data, true_data, false_data = read_data(file_path_json)
    # Augment positive and negative samples with different proportions.
    data_eva_true = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin, cilinpath,
                                                same_pinyin_file, chinese_word_freq_file, portition=0.2)
    data_eva_false = synword_and_samepinyin_data(false_data, save_data_synwords_and_samepinyin, cilinpath,
                                                 same_pinyin_file, chinese_word_freq_file, portition=0.3)
    new_data = data_eva_true + data_eva_false + data  # dataset including the augmented samples
    all_train_data, all_train_data_pattern = get_data_pattern(new_data)
    all_train_data, all_train_train, all_train_dev, all_train_test = data_inverse(all_train_data)
    dev_data_from_train = new_data[dev_samples:]  # tail of the data becomes the validation split
    new_data = new_data[0:dev_samples]  # the remainder is the training split
    final_data, pattern_data = get_data_pattern(new_data)
    all_data, train, dev, test = data_inverse(final_data)
    dev_data_from_train_1, dev_data_from_train_pattern = get_data_pattern(dev_data_from_train)
    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data, all_train_data_path)
    save_data(all_data, all_data_path_txt)
    save_data(train, train_path_txt)
    save_data(test, test_path_txt)
    save_data(dev, dev_path_txt)
    save_data(dev_data_from_train_1, dev_from_train_path_txt)
    # Convert the raw test CSV into the text test set.
    dev_csv_path = "./dataset/test_set.csv"
    dev_txt_path = save_data_dir + "test_set.txt"
    dev = read_data(dev_csv_path, dev=True)
    dev_data, pattern_dev = get_data_pattern(dev, dev=True)
    save_data(dev_data, dev_txt_path, columns_num=4)
def remove_stopwords_sample():
    """Strip stopwords and politeness fillers (e.g. "may I ask", "thanks")
    from both the training and the test set.

    :return: tuple (remove_stopwords_train, remove_stopwords_test) where the
        train entries are [q1, q2, label] and the test entries are [q1, q2]
    """
    train_path = "./train_set.json"
    dev_path = "./test_set.csv"
    stopword_path = "./stopwords.txt"
    otherword_path = "./otherwords.txt"
    save_data_dir = "./tongji/"
    # makedirs(exist_ok=True) avoids both the missing-parent failure of
    # os.mkdir and the exists()/mkdir race.
    os.makedirs(save_data_dir, exist_ok=True)
    remove_stopwords_train_path = save_data_dir + "remove_stopwords_train_set.txt"
    remove_stopwords_test_path = save_data_dir + "remove_stopwords_test_set.txt"
    stopwords = stopwordList(stopword_path)
    otherword = stopwordList(otherword_path)
    # stopwords += otherword
    train_data, train_true_data, train_false_data = read_data(train_path)
    dev_data = read_data(dev_path, dev=True)
    remove_stopwords_train = []
    for sample in tqdm(train_data):
        q1 = seg_depart(sample[0], stopwords, otherword)
        q2 = seg_depart(sample[1], stopwords, otherword)
        remove_stopwords_train.append([q1, q2, sample[2]])
    remove_stopwords_test = []
    for sample in tqdm(dev_data):
        q1 = seg_depart(sample[0], stopwords, otherword)
        q2 = seg_depart(sample[1], stopwords, otherword)
        remove_stopwords_test.append([q1, q2])
    # Only write the files on the first run; existing outputs are kept as-is.
    if not os.path.exists(remove_stopwords_train_path) and not os.path.exists(remove_stopwords_test_path):
        save_data(remove_stopwords_train, remove_stopwords_train_path, columns_num=3)
        save_data(remove_stopwords_test, remove_stopwords_test_path, columns_num=2)
    return remove_stopwords_train, remove_stopwords_test
def the_average_length_of_question_in_train_dataset():
    """Compute the average question length (in characters) over the training set.

    Every sample contributes two questions (sample[0] and sample[1]).

    :return: the average question length; 0.0 when the training set is empty
    """
    train_path = "./train_set.json"
    train_data, train_true_data, train_false_data = read_data(train_path)
    # Each sample holds a question pair, so the question count is 2 * samples.
    all_question = 2 * len(train_data)
    # Guard against an empty dataset: the original divided by zero here.
    if all_question == 0:
        return 0.0
    all_length = sum(len(sample[0]) + len(sample[1]) for sample in train_data)
    average_length_question = all_length / all_question
    print("the_average_length_of_question_in_train_dataset is {}".format(average_length_question))
    return average_length_question