def get_the_final_data():
    """Build the final dataset in the format [q11, q21, q12, q22, label].

    Reads the raw training set, augments the positive samples via
    synonym / same-pinyin replacement, derives pattern features, splits
    into train/dev/test and writes everything to disk, then processes the
    dev CSV into an evaluation file.

    :return: None (all results are written to disk)
    """
    save_data_dir = "./final_data/final_data_2/"
    # makedirs + exist_ok: os.mkdir raises FileNotFoundError when the parent
    # "./final_data" does not exist yet, and the old exists()/mkdir() pair
    # raced if the directory appeared in between.
    os.makedirs(save_data_dir, exist_ok=True)
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"

    data, true_data, false_data = read_data(file_path_json)
    # Data augmentation: synonym / same-pinyin replacement on positive pairs.
    data_eva = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin,
                                           cilinpath, same_pinyin_file, chinese_word_freq_file)
    new_data = data_eva + data  # augmented samples plus the originals
    final_data, pattern_data = get_data_pattern(new_data)
    all_data, train, dev, test = data_inverse(final_data)

    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    save_data(all_data, all_data_path_txt)
    save_data(train, train_path_txt)
    save_data(test, test_path_txt)
    save_data(dev, dev_path_txt)

    # Build the evaluation set from the dev CSV.
    dev_csv_path = "./dataset/dev_set.csv"
    dev_txt_path = save_data_dir + "dev_set.txt"
    dev = read_data(dev_csv_path, dev=True)
    dev_data, pattern_dev = get_data_pattern(dev, dev=True)
    save_data(dev_data, dev_txt_path, columns_num=4)
def remove_stopwords_sample():
    """Remove stopwords and politeness phrases (e.g. 请问, 谢谢了) from the data.

    Produces a cleaned training set and test set, optionally caching them
    to disk under ``./tongji/``.

    :return: tuple ``(train, test)`` — train rows are ``[q1, q2, label]``,
             test rows are ``[q1, q2]``.
    """
    train_path = "./train_set.json"
    dev_path = "./test_set.csv"
    stopword_path = "./stopwords.txt"
    otherword_path = "./otherwords.txt"
    save_data_dir = "./tongji/"
    # exist_ok avoids the check-then-create race of exists()/mkdir().
    os.makedirs(save_data_dir, exist_ok=True)
    remove_stopwords_train_path = save_data_dir + "remove_stopwords_train_set.txt"
    remove_stopwords_test_path = save_data_dir + "remove_stopwords_test_set.txt"

    stopwords = stopwordList(stopword_path)
    otherword = stopwordList(otherword_path)
    # stopwords += otherword

    train_data, train_true_data, train_false_data = read_data(train_path)
    dev_data = read_data(dev_path, dev=True)

    remove_stopwords_train = [
        [seg_depart(sample[0], stopwords, otherword),
         seg_depart(sample[1], stopwords, otherword),
         sample[2]]
        for sample in tqdm(train_data)
    ]
    remove_stopwords_test = [
        [seg_depart(sample[0], stopwords, otherword),
         seg_depart(sample[1], stopwords, otherword)]
        for sample in tqdm(dev_data)
    ]

    # NOTE(review): files are written only when BOTH are missing — presumably
    # a caching shortcut, but `or` may have been intended; confirm before use.
    if not os.path.exists(remove_stopwords_train_path) and not os.path.exists(remove_stopwords_test_path):
        save_data(remove_stopwords_train, remove_stopwords_train_path, columns_num=3)
        save_data(remove_stopwords_test, remove_stopwords_test_path, columns_num=2)
    return remove_stopwords_train, remove_stopwords_test
def get_the_final_data_5(dev_samples=-5000):
    """Build the final dataset in the format [q1, q2, label], carving the
    last 5000 rows of the training set off as a held-out dev split.

    Note: only the augmented data is used; the original samples are excluded.

    :param dev_samples: negative slice index — the last ``-dev_samples``
                        rows become the held-out dev split.
    :return: None (all results are written to disk)
    """
    save_data_dir = "./final_data/final_data_9/"
    # makedirs + exist_ok: os.mkdir raises FileNotFoundError when the parent
    # "./final_data" is missing, and the old exists()/mkdir() pair raced.
    os.makedirs(save_data_dir, exist_ok=True)
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"

    data, true_data, false_data = read_data(file_path_json)
    # Augment every sample via synonym / same-pinyin replacement.
    data_eva = synword_and_samepinyin_data(data, save_data_synwords_and_samepinyin, cilinpath,
                                           same_pinyin_file, chinese_word_freq_file, portition=1)
    # new_data = data_eva + data  # would also include the original samples
    new_data = data_eva
    # (fixed typo: was `all_train_dat_3`)
    all_train_data, all_train_data_1, all_train_data_2, all_train_data_3 = data_inverse(new_data, pattern=False)

    dev_data_from_train = new_data[dev_samples:]  # tail slice used as held-out dev set
    new_data = new_data[0:dev_samples]            # remaining rows used for training
    all_data, train, dev, test = data_inverse(new_data, pattern=False)
    # dev_data_from_train_1, dev_data_from_train_pattern = get_data_pattern(dev_data_from_train)

    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data, all_train_data_path, columns_num=3)
    save_data(all_data, all_data_path_txt, columns_num=3)
    save_data(train, train_path_txt, columns_num=3)
    save_data(test, test_path_txt, columns_num=3)
    save_data(dev, dev_path_txt, columns_num=3)
    save_data(dev_data_from_train, dev_from_train_path_txt, columns_num=3)

    # Build the test set. NOTE(review): the augmentation below is run only
    # for its file side effect — the UN-augmented `dev` rows are what gets
    # saved; confirm this is intentional.
    dev_csv_path = "./dataset/test_set.csv"
    dev_txt_path = save_data_dir + "test_set.txt"
    dev = read_data(dev_csv_path, dev=True)
    save_data_synwords_and_samepinyin_for_dev = (
        save_data_dir + "data_replace_by_synwords_and_samepinyin_for_dev.txt"
    )
    data_eva = synword_and_samepinyin_data(dev, save_data_synwords_and_samepinyin_for_dev,
                                           cilinpath, same_pinyin_file, chinese_word_freq_file,
                                           columns_num=2, portition=1)
    # dev_data, pattern_dev = get_data_pattern(dev, dev=True)
    save_data(dev, dev_txt_path, columns_num=2)
def get_the_final_data_4(dev_samples=-5000):
    """Build the final dataset in the format [q11, q21, q12, q22, q31, label],
    carving the last 5000 training rows off as a held-out dev split.

    Augmentation is scaled up (portition=0.1 on positive pairs).
    Column meanings (per the original notes):
      q12: words of q1 not shared with q2
      q22: words of q2 not shared with q1
      q31: words shared by q1 and q2

    :param dev_samples: negative slice index — the last ``-dev_samples``
                        rows become the held-out dev split.
    :return: None (all results are written to disk)
    """
    save_data_dir = "./final_data/final_data_6/"
    # makedirs + exist_ok: os.mkdir raises FileNotFoundError when the parent
    # "./final_data" is missing, and the old exists()/mkdir() pair raced.
    os.makedirs(save_data_dir, exist_ok=True)
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"

    data, true_data, false_data = read_data(file_path_json)
    # Augment positive pairs via synonym / same-pinyin replacement.
    data_eva = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin,
                                           cilinpath, same_pinyin_file, chinese_word_freq_file,
                                           portition=0.1)
    new_data = data_eva + data  # augmented samples plus the originals
    print(len(new_data))
    all_train_data, all_train_data_pattern = get_data_pattern(new_data, mode=2)
    all_train_data, all_train_train, all_train_dev, all_train_test = data_inverse(all_train_data, mode=2)
    print(len(all_train_data))

    dev_data_from_train = new_data[dev_samples:]  # tail slice used as held-out dev set
    new_data = new_data[0:dev_samples]            # remaining rows used for training
    final_data, pattern_data = get_data_pattern(new_data, mode=2)
    all_data, train, dev, test = data_inverse(final_data, mode=2)
    dev_data_from_train_1, dev_data_from_train_pattern = get_data_pattern(dev_data_from_train, mode=2)

    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data, all_train_data_path, columns_num=6)
    save_data(all_data, all_data_path_txt, columns_num=6)
    save_data(train, train_path_txt, columns_num=6)
    save_data(test, test_path_txt, columns_num=6)
    save_data(dev, dev_path_txt, columns_num=6)
    save_data(dev_data_from_train_1, dev_from_train_path_txt, columns_num=6)

    # Build the evaluation set from the dev CSV.
    dev_csv_path = "./dataset/dev_set.csv"
    dev_txt_path = save_data_dir + "dev_set.txt"
    dev = read_data(dev_csv_path, dev=True)
    dev_data, pattern_dev = get_data_pattern(dev, dev=True, mode=2)
    save_data(dev_data, dev_txt_path, columns_num=5)
def get_the_final_data_3(dev_samples=-5000):
    """Build the final dataset in the format [q11, q21, q12, q22, label],
    carving the last 5000 training rows off as a held-out dev split; the
    pattern step replaces shared words with a wildcard.

    Both positive (portition=0.2) and negative (portition=0.3) samples
    are augmented.

    :param dev_samples: negative slice index — the last ``-dev_samples``
                        rows become the held-out dev split.
    :return: None (all results are written to disk)
    """
    save_data_dir = "./final_data/final_data_10/"
    # makedirs + exist_ok: os.mkdir raises FileNotFoundError when the parent
    # "./final_data" is missing, and the old exists()/mkdir() pair raced.
    os.makedirs(save_data_dir, exist_ok=True)
    cilinpath = "./cilin.txt"
    file_path_json = "./dataset/train_set.json"
    same_pinyin_file = "./same_pinyin.txt"
    chinese_word_freq_file = "./chinese-words.txt"
    save_data_synwords_and_samepinyin = save_data_dir + "data_replace_by_synwords_and_samepinyin.txt"

    data, true_data, false_data = read_data(file_path_json)
    # Augment positive pairs via synonym / same-pinyin replacement.
    data_eva_true = synword_and_samepinyin_data(true_data, save_data_synwords_and_samepinyin,
                                                cilinpath, same_pinyin_file, chinese_word_freq_file,
                                                portition=0.2)
    # Augment negative pairs as well.
    data_eva_false = synword_and_samepinyin_data(false_data, save_data_synwords_and_samepinyin,
                                                 cilinpath, same_pinyin_file, chinese_word_freq_file,
                                                 portition=0.3)
    new_data = data_eva_true + data_eva_false + data  # augmented + original samples
    all_train_data, all_train_data_pattern = get_data_pattern(new_data)
    all_train_data, all_train_train, all_train_dev, all_train_test = data_inverse(all_train_data)

    dev_data_from_train = new_data[dev_samples:]  # tail slice used as held-out dev set
    new_data = new_data[0:dev_samples]            # remaining rows used for training
    final_data, pattern_data = get_data_pattern(new_data)
    all_data, train, dev, test = data_inverse(final_data)
    dev_data_from_train_1, dev_data_from_train_pattern = get_data_pattern(dev_data_from_train)

    all_train_data_path = save_data_dir + "all_train_data.txt"
    all_data_path_txt = save_data_dir + "train_set.txt"
    train_path_txt = save_data_dir + "train.txt"
    test_path_txt = save_data_dir + "test.txt"
    dev_path_txt = save_data_dir + "dev.txt"
    dev_from_train_path_txt = save_data_dir + "dev_split.txt"
    save_data(all_train_data, all_train_data_path)
    save_data(all_data, all_data_path_txt)
    save_data(train, train_path_txt)
    save_data(test, test_path_txt)
    save_data(dev, dev_path_txt)
    save_data(dev_data_from_train_1, dev_from_train_path_txt)

    # Build the test set from the test CSV.
    dev_csv_path = "./dataset/test_set.csv"
    dev_txt_path = save_data_dir + "test_set.txt"
    dev = read_data(dev_csv_path, dev=True)
    dev_data, pattern_dev = get_data_pattern(dev, dev=True)
    save_data(dev_data, dev_txt_path, columns_num=4)