def data_generate(train_data_path, test_data_path):
    '''
    First-pass data processing: load + preprocess.
    :param train_data_path: path to the training CSV
    :param test_data_path: path to the test CSV
    :return: processed train, test and merged DataFrames
    '''
    # 1. Load the data
    train_df, test_df = load_dataset(train_data_path, test_data_path)
    print('train data size {}, test data size {}'.format(len(train_df), len(test_df)))
    # 2. Drop rows with missing values
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how='any', inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)
    # 3. Parallel (multi-process) row processing
    train_df = parallelize(train_df, data_frame_proc)
    test_df = parallelize(test_df, data_frame_proc)
    # 4. Save the processed data
    train_df.to_csv(train_seg_path, index=False, header=True)
    test_df.to_csv(test_seg_path, index=False, header=True)
    # 5. Merge train and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {}, test data size {}, merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))
    # 6. Save the merged data
    merged_df.to_csv(merger_seg_path, index=False, header=True)
    return train_df, test_df, merged_df
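# Note: nearly every function in this file funnels its DataFrame through
# parallelize(df, func), which is defined elsewhere. A minimal sketch of such a
# helper, assuming it splits the frame into chunks and maps func over them with
# a process pool (the chunk count and pool size below are illustrative):
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

def parallelize(df, func, num_partitions=None):
    """Split df into chunks, run func on each chunk in its own process,
    then concatenate the processed chunks back into one DataFrame."""
    num_partitions = num_partitions or cpu_count()
    chunks = np.array_split(df, num_partitions)
    with Pool(num_partitions) as pool:
        df = pd.concat(pool.map(func, chunks))
    return df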
def preprocess(train_data_path, test_data_path):
    if os.path.exists(config.train_seg_path) and \
            os.path.exists(config.test_seg_path) and \
            os.path.exists(config.merger_seg_path):
        # Cached segmented files exist: load them directly
        # (fixed: the test set was previously read from train_seg_path)
        train_df = pd.read_csv(config.train_seg_path)
        test_df = pd.read_csv(config.test_seg_path)
        train_df.dropna(subset=['Report'], inplace=True)
        test_df.dropna(subset=['Report'], inplace=True)
        train_df.fillna('', inplace=True)
        test_df.fillna('', inplace=True)
    else:
        # 1. Load the data
        train_df = pd.read_csv(train_data_path)
        test_df = pd.read_csv(test_data_path)
        print('train data size {}, test data size {}'.format(len(train_df), len(test_df)))
        # 2. Drop rows with a missing Report, fill remaining NaNs
        train_df.dropna(subset=['Report'], inplace=True)
        test_df.dropna(subset=['Report'], inplace=True)
        train_df.fillna('', inplace=True)
        test_df.fillna('', inplace=True)
        # 3. Parallel batch processing
        train_df = parallelize(train_df, sentences_proc)
        test_df = parallelize(test_df, sentences_proc)
        # 4. Merge train and test sets
        train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
        test_df['merged'] = test_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
        merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
        print('train data size {}, test data size {}, merged_df data size {}'.format(
            len(train_df), len(test_df), len(merged_df)))
        # 5. Save the processed train / test sets
        train_df = train_df.drop(['merged'], axis=1)
        test_df = test_df.drop(['merged'], axis=1)
        train_df.to_csv(config.train_seg_path, index=False)
        test_df.to_csv(config.test_seg_path, index=False)
        # 6. Save the merged data
        merged_df.to_csv(config.merger_seg_path, index=False)
    return train_df, test_df
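# Note: sentences_proc (and the similar data_frame_proc above) receives one
# DataFrame chunk and is expected to clean and tokenize its text columns. A
# minimal sketch, assuming cleaning means jieba segmentation plus stop-word
# filtering, the usual recipe for Chinese text like this; the column list and
# the empty stop_words set are illustrative:
import jieba

stop_words = set()  # illustrative; normally loaded from a stop-word file

def clean_sentence(sentence):
    """Segment one sentence with jieba and drop stop words."""
    return ' '.join(w for w in jieba.cut(str(sentence))
                    if w.strip() and w not in stop_words)

def sentences_proc(df):
    """Clean every text column present in this chunk."""
    for col in ['Question', 'Dialogue', 'Report']:
        if col in df.columns:
            df[col] = df[col].apply(clean_sentence)
    return df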
def data_loader(params, is_rebuild_dataset=False):
    if os.path.exists(config.train_x_path) and not is_rebuild_dataset:
        # Load the cached arrays
        x_train = np.load(config.train_x_path)
        x_test = np.load(config.test_x_path)
        y_train = np.load(config.train_y_path)
        y_test = np.load(config.test_y_path)
        with open(config.vocab_save_path, 'r', encoding='utf-8') as f:
            vocab = {}
            for content in f.readlines():
                k, v = content.strip().split('\t')
                vocab[k] = int(v)
        label_df = pd.read_csv(config.data_label_path)
        # Multi-label encoding: fit on a single sample containing every label
        # so classes_ covers the full label set
        mlb = MultiLabelBinarizer()
        mlb.fit([label_df['label']])
        return x_train, x_test, y_train, y_test, vocab, mlb

    df = pd.read_csv(config.data_path, header=None).rename(columns={
        0: 'label',
        1: 'content'
    })
    df = parallelize(df, proc)
    text_preprocesser = tf.keras.preprocessing.text.Tokenizer(
        num_words=params['vocab_size'], oov_token="<UNK>")
    text_preprocesser.fit_on_texts(df['content'])
    vocab = text_preprocesser.word_index
    with open(config.vocab_save_path, 'w', encoding='utf-8') as f:
        for k, v in vocab.items():
            f.write(f'{k}\t{v}\n')
    x = text_preprocesser.texts_to_sequences(df['content'])
    x = tf.keras.preprocessing.sequence.pad_sequences(
        x, maxlen=params['padding_size'], padding='post', truncating='post')
    # label_df = pd.read_csv(config.data_label_path)
    # Multi-label encoding
    mlb = MultiLabelBinarizer()
    df['label'] = df['label'].apply(lambda x: x.split())
    mlb.fit(df['label'])
    y = mlb.transform(df['label'])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    np.save(config.train_x_path, x_train)
    np.save(config.test_x_path, x_test)
    np.save(config.train_y_path, y_train)
    np.save(config.test_y_path, y_test)
    return x_train, x_test, y_train, y_test, vocab, mlb
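# Note: the label pipeline above splits each row's label string into tokens and
# lets MultiLabelBinarizer map each token set to a multi-hot vector. A small
# self-contained demonstration of that round trip (the labels are made up):
from sklearn.preprocessing import MultiLabelBinarizer

labels = ['sports football', 'finance', 'sports tennis']  # made-up examples
token_lists = [s.split() for s in labels]

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(token_lists)
print(mlb.classes_)                  # ['finance' 'football' 'sports' 'tennis']
print(y[0])                          # [0 1 1 0] for 'sports football'
print(mlb.inverse_transform(y[:1]))  # [('football', 'sports')]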
def data_loader(params, is_rebuild_dataset=False):
    if os.path.exists(os.path.join(root, 'data', 'X_train.npy')) and not is_rebuild_dataset:
        X_train = np.load(os.path.join(root, 'data', 'X_train.npy'))
        X_test = np.load(os.path.join(root, 'data', 'X_test.npy'))
        y_train = np.load(os.path.join(root, 'data', 'y_train.npy'))
        y_test = np.load(os.path.join(root, 'data', 'y_test.npy'))
        return X_train, X_test, y_train, y_test
    # Load the raw data
    df = pd.read_csv(params.data_path, header=None).rename(columns={
        0: 'label',
        1: 'content'
    })
    # Clean the data in parallel
    df = parallelize(df, proc)
    # word2index
    text_preprocesser = Tokenizer(num_words=params.vocab_size, oov_token="<UNK>")
    text_preprocesser.fit_on_texts(df['content'])
    # Save the vocabulary (fixed: 'voab.txt' typo and unsafe path concatenation)
    word_dict = text_preprocesser.word_index
    with open(os.path.join(params.vocab_save_dir, 'vocab.txt'), 'w', encoding='utf-8') as f:
        for k, v in word_dict.items():
            f.write(f'{k}\t{v}\n')
    x = text_preprocesser.texts_to_sequences(df['content'])
    # Padding
    x = pad_sequences(x, maxlen=params.padding_size, padding='post', truncating='post')
    # Split the label strings into tokens
    df['label'] = df['label'].apply(lambda x: x.split())
    # Multi-label encoding
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df['label'])
    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    # Save the arrays
    np.save(os.path.join(root, 'data', 'X_train.npy'), X_train)
    np.save(os.path.join(root, 'data', 'X_test.npy'), X_test)
    np.save(os.path.join(root, 'data', 'y_train.npy'), y_train)
    np.save(os.path.join(root, 'data', 'y_test.npy'), y_test)
    return X_train, X_test, y_train, y_test
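# Note: unlike the dict-style variant above, this data_loader reads its
# settings as attributes (params.data_path, params.vocab_size, ...). A
# hypothetical call with an argparse-style namespace; all values below are
# illustrative, the real ones come from the project's config:
from types import SimpleNamespace

params = SimpleNamespace(data_path='data/labeled.csv',  # illustrative path
                         vocab_size=50000,
                         padding_size=200,
                         vocab_save_dir='data/')
X_train, X_test, y_train, y_test = data_loader(params, is_rebuild_dataset=True)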
def build_data(params):
    if os.path.exists(os.path.join(root, 'data', 'X_train.npy')):
        # NOTE: this cached branch cannot recover the binarizer or vocab;
        # delete the .npy files and rebuild if those are needed.
        X_train = np.load(os.path.join(root, 'data', 'X_train.npy'))
        X_test = np.load(os.path.join(root, 'data', 'X_test.npy'))
        y_train = np.load(os.path.join(root, 'data', 'y_train.npy'))
        y_test = np.load(os.path.join(root, 'data', 'y_test.npy'))
        return X_train, X_test, y_train, y_test

    data = pd.read_csv(params['data_path'], header=None).rename(columns={0: 'label', 1: 'content'})
    processed_data = parallelize(data, proc)
    # word2index
    text_preprocesser = Tokenizer(num_words=params['vocab_size'], oov_token="<UNK>")
    text_preprocesser.fit_on_texts(processed_data['content'])
    # Save the vocabulary
    word_dict = text_preprocesser.word_index
    with open(params['vocab_path'], 'w', encoding='utf-8') as f:
        for k, v in word_dict.items():
            f.write(f'{k}\t{v}\n')
    x = text_preprocesser.texts_to_sequences(processed_data['content'])
    # Padding
    x = pad_sequences(x, maxlen=params['padding_size'], padding='post', truncating='post')
    # Encode the labels (fixed: the multi_class branch previously built `lb`
    # while the return statement referenced the undefined name `mlb`)
    if params['train_mode'] == "multi_label":
        # Multi-label: every token in the label string is a label
        processed_data['label'] = processed_data['label'].apply(lambda x: x.split())
        binarizer = MultiLabelBinarizer()
        y = binarizer.fit_transform(processed_data['label'])
    elif params['train_mode'] == "multi_class":
        # Multi-class: the second token of the label string is the class
        processed_data['subject'] = processed_data['label'].apply(lambda x: x.split()[1])
        print("class category: ", set(processed_data['subject']))
        binarizer = LabelBinarizer()
        y = binarizer.fit_transform(processed_data['subject'])
    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    # Save the arrays
    np.save(os.path.join(root, 'data', 'X_train.npy'), X_train)
    np.save(os.path.join(root, 'data', 'X_test.npy'), X_test)
    np.save(os.path.join(root, 'data', 'y_train.npy'), y_train)
    np.save(os.path.join(root, 'data', 'y_test.npy'), y_test)
    return X_train, X_test, y_train, y_test, binarizer, word_dict  # binarizer + vocab
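# Note: the two train_mode branches differ only in the encoder.
# MultiLabelBinarizer turns token lists into multi-hot rows, while
# LabelBinarizer one-hot encodes a single class per row (here the second token
# of the label string). A toy comparison with invented labels:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

raw = ['O 教育 正面', 'O 价格 负面', 'O 配套 中立']  # invented three-row sample

# multi_label: every token in the string is a label -> one column per token
print(MultiLabelBinarizer().fit_transform([s.split() for s in raw]))

# multi_class: only the second token is the class -> one column per class
print(LabelBinarizer().fit_transform([s.split()[1] for s in raw]))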
def build_dataset(train_data_path, test_data_path):
    '''
    Load + preprocess the dataset.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: train data, train labels, test data
    '''
    # 1. Load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {}, test data size {}'.format(len(train_df), len(test_df)))
    # 2. Drop rows with a missing Report, fill remaining NaNs
    train_df.dropna(subset=['Report'], inplace=True)
    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)
    # 3. Parallel batch processing
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)
    # 4. Merge train and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {}, test data size {}, merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))
    # 5. Save the processed train / test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)
    train_df.to_csv(train_seg_path, index=False, header=False)
    test_df.to_csv(test_seg_path, index=False, header=False)
    # 6. Save the merged data
    merged_df.to_csv(merger_seg_path, index=False, header=False)
    # 7. Train word vectors
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_seg_path),
                        size=embedding_dim,
                        sg=1,
                        workers=8,
                        iter=wv_train_epochs,
                        window=5,
                        min_count=5)
    # 8. Split data and labels
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    # Train / validation split
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['X'], train_df['Report'],
        test_size=0.002,  # 80k * 0.002 = 160 validation samples
    )
    X_train.to_csv(train_x_seg_path, index=False, header=False)
    y_train.to_csv(train_y_seg_path, index=False, header=False)
    X_val.to_csv(val_x_seg_path, index=False, header=False)
    y_val.to_csv(val_y_seg_path, index=False, header=False)
    test_df['X'].to_csv(test_x_seg_path, index=False, header=False)
    # 9. Add start/stop markers, replace OOV words with <UNK>, pad to length,
    # using the vocab produced by the gensim model
    vocab = wv_model.wv.vocab
    # Pick a suitable maximum length for the inputs
    train_x_max_len = get_max_len(train_df['X'])
    test_x_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_x_max_len)
    train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    # Pick a suitable maximum length for the labels
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))
    # 10. Save the padded / OOV-handled data and labels
    train_df['X'].to_csv(train_x_pad_path, index=False, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=False, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=False, header=False)
    # print('train_x_max_len:{}, train_y_max_len:{}'.format(X_max_len, train_y_max_len))
    # 11. Retrain the word vectors (optional, currently disabled)
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # Save the word-vector model
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))
    # 12. Update the vocab from the trained model
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}
    # Save both dictionaries
    save_dict(save_vocab_path, vocab)
    save_dict(reverse_vocab_path, reverse_vocab)
    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)
    # 14. Convert words to indices: [<START> 方向机 重 ...] -> [32800, 403, 986, 246, 231, ...]
    # vocab = Vocab()
    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))
    # 15. Convert the index lists to numpy arrays
    # [32800, 403, 986, 246, 231] --> array([[32800, 403, 986, ...]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())
    # Save the arrays
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)
    return train_X, train_Y, test_X
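# Note: pad_proc does the heavy lifting of step 9: truncate to max_len, replace
# words missing from the Word2Vec vocab with <UNK>, wrap the sentence in
# <START>/<STOP>, and right-pad with <PAD>. A sketch consistent with how it is
# called here; the exact marker token names are assumptions:
def pad_proc(sentence, max_len, vocab):
    """Fix one space-separated sentence to exactly max_len + 2 tokens."""
    words = sentence.strip().split(' ')[:max_len]           # truncate
    words = [w if w in vocab else '<UNK>' for w in words]   # mark OOV words
    # wrap with <START>/<STOP>, then right-pad to a uniform length
    return ' '.join(['<START>'] + words + ['<STOP>']
                    + ['<PAD>'] * (max_len - len(words)))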
def get_max_len(data):
    """
    Pick a suitable maximum sequence length for a column.
    :param data: the column to measure, e.g. train_df['Question']
    :return: maximum length to use
    """
    max_lens = data.apply(lambda x: x.count(' ') + 1)
    return int(np.mean(max_lens) + 2 * np.std(max_lens))


if __name__ == '__main__':
    train_df = pd.read_csv(config.train_data_path)
    test_df = pd.read_csv(config.test_data_path)
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how='any', inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)
    train_df = parallelize(train_df, process_seq2seq)
    test_df = parallelize(test_df, process_seq2seq)
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    # Pick a suitable maximum length for the inputs
    train_x_max_len = get_max_len(train_df['X'])
    test_x_max_len = get_max_len(test_df['X'])
    x_max_len = max(train_x_max_len, test_x_max_len)
    # Pick a suitable maximum length for the labels
    train_y_max_len = get_max_len(train_df['Report'])
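# Note: get_max_len counts tokens per row (spaces + 1) and returns
# mean + 2 * std, which keeps roughly 95%+ of rows un-truncated when lengths
# are near-normal, instead of padding everything to the longest outlier.
# A quick worked check with made-up rows:
import numpy as np
import pandas as pd

data = pd.Series(['a b c', 'a b', 'a b c d e f'])  # token counts: 3, 2, 6
lens = data.apply(lambda x: x.count(' ') + 1)
print(int(np.mean(lens) + 2 * np.std(lens)))       # 3.67 + 2 * 1.70 -> 7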
def build_dataset(train_data_path, test_data_path, save_wv_model_path,
                  testOnly=True, toCSV=True):
    '''
    Load + preprocess the dataset.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :param save_wv_model_path: path to load/save the Word2Vec model
    :param testOnly: skip word-vector retraining and return early
    :param toCSV: persist intermediate DataFrames to CSV
    :return: padded train X, train Y, test X, and the Word2Vec model
    '''
    # 1. Load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {}, test data size {}'.format(len(train_df), len(test_df)))
    # 2. Drop rows with missing values
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how='any', inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)
    # 3. Multi-process batch processing
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)
    # 4. Merge train and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {}, test data size {}, merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))
    # 5. Save the processed train / test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)
    if toCSV:
        train_df.to_csv(train_seg_path, index=False, header=True)
        test_df.to_csv(test_seg_path, index=False, header=True)
    # 6. Save the merged data
    merged_df.to_csv(merger_seg_path, index=False, header=False)
    if osp.exists(save_wv_model_path):
        wv_model = Word2Vec.load(save_wv_model_path)
    else:
        # 7. Train word vectors
        print('start build w2v model')
        wv_model = Word2Vec(LineSentence(merger_seg_path),
                            size=embedding_dim,
                            negative=5,
                            workers=8,
                            iter=wv_train_epochs,
                            window=3,
                            min_count=5)
    # 8. Split data and labels
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    # 9. Add start/stop markers, replace OOV words with <UNK>, pad to length,
    # using the vocab produced by the gensim model
    vocab = wv_model.wv.vocab
    # Pick a suitable maximum length for the inputs
    train_x_max_len = get_max_len(train_df['X'])
    test_x_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_x_max_len)
    train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    # Pick a suitable maximum length for the labels
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))
    # 10. Save the padded / OOV-handled data and labels
    if toCSV:
        train_df['X'].to_csv(train_x_pad_path, index=False, header=False)
        train_df['Y'].to_csv(train_y_pad_path, index=False, header=False)
        test_df['X'].to_csv(test_x_pad_path, index=False, header=False)
    if testOnly:
        print("No retraining! Test only...")
        return train_df['X'], train_df['Y'], test_df['X'], wv_model
    else:
        # 11. Retrain the word vectors on the padded files
        print('start retrain w2v model')
        wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
        wv_model.train(LineSentence(train_x_pad_path), epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        print('1/3')
        wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
        wv_model.train(LineSentence(train_y_pad_path), epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        print('2/3')
        wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
        wv_model.train(LineSentence(test_x_pad_path), epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        # Save the word-vector model
        wv_model.save(save_wv_model_path)
        # or load an existing one:
        # wv_model = Word2Vec.load(save_wv_model_path)
        print('finish retrain w2v model')
        print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))
        return train_df['X'], train_df['Y'], test_df['X'], wv_model
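# Note: save_dict, used by these pipelines to persist vocab and reverse_vocab,
# is not shown in this section. A minimal sketch, assuming the same
# tab-separated key/value format that data_loader above parses back with
# content.strip().split('\t'):
def save_dict(save_path, dict_data):
    """Write a dict as one 'key<TAB>value' pair per line (UTF-8)."""
    with open(save_path, 'w', encoding='utf-8') as f:
        for k, v in dict_data.items():
            f.write(f'{k}\t{v}\n')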
def build_dataset(train_data_path, test_data_path, word2vec_type=True):
    '''
    Build the dataset.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :param word2vec_type: True to train Word2Vec, False to train FastText
    :return: padded train X, train Y, test X
    '''
    # 1. Load the data
    train_df, test_df = load_dataset(train_data_path, test_data_path)
    print('train data size {}, test data size {}'.format(len(train_df), len(test_df)))
    # 2. Drop rows with a missing Report, fill remaining NaNs
    train_df.dropna(subset=['Report'], inplace=True)
    # test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)
    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)
    # 3. Parallel (multi-process) processing
    train_df = parallelize(train_df, data_frame_proc)
    test_df = parallelize(test_df, data_frame_proc)
    # 4. Merge train and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)
    # 5. Save the processed and merged data (previously written twice; one save suffices)
    train_df.to_csv(train_seg_path, index=False, header=True)
    test_df.to_csv(test_seg_path, index=False, header=True)
    merged_df.to_csv(merger_seg_path, index=False, header=False)
    print('train data size {}, test data size {}, merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))
    # 6. Train word vectors
    print('start build w2v model')
    if word2vec_type:
        wv_model = Word2Vec(LineSentence(merger_seg_path),
                            size=embedding_dim,
                            negative=5,
                            workers=8,
                            iter=wv_train_epochs,
                            window=3,
                            min_count=5)
    else:
        wv_model = FastText(LineSentence(merger_seg_path),
                            size=embedding_dim,  # keep dims consistent with the Word2Vec branch
                            workers=8,
                            min_count=5,
                            window=3,
                            iter=wv_train_epochs)
    # 7. Split data and labels
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    # Train / validation split
    x_train, x_val, y_train, y_val = train_test_split(
        train_df['X'], train_df['Report'], test_size=0.002)
    x_train.to_csv(train_x_seg_path, index=False, header=False)
    y_train.to_csv(train_y_seg_path, index=False, header=False)
    x_val.to_csv(val_x_seg_path, index=False, header=False)
    y_val.to_csv(val_y_seg_path, index=False, header=False)
    test_df['X'].to_csv(test_x_seg_path, index=False, header=False)
    # 8. Add start/stop markers, replace OOV words with <UNK>, pad to length,
    # using the vocab produced by the gensim model
    vocab = wv_model.wv.vocab
    # Pick a suitable maximum length for the inputs
    train_x_max_len = get_max_len(train_df['X'])
    test_x_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_x_max_len)
    print("training sequence length is: ", X_max_len)
    train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    # Pick a suitable maximum length for the labels
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))
    print("report sequence length is: ", train_y_max_len)
    # 9. Save the padded / OOV-handled data and labels
    train_df['X'].to_csv(train_x_pad_path, index=False, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=False, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=False, header=False)
    # 10. Retrain the word vectors (to be wrapped into its own helper; currently disabled)
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
    # Save the word-vector model
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))
    # 11. Update the vocab from the trained model
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}
    # Save both dictionaries
    save_dict(vocab_path, vocab)
    save_dict(reverse_vocab_path, reverse_vocab)
    # 12. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)
    # 13. Convert the datasets from words to indices
    vocab = Vocab()
    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))
    # 14. Convert the index lists to numpy arrays
    # [32800, 403, 986, 246, 231] --> array([[32800, 403, 986, ...]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())
    # Save the arrays
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)
    return train_X, train_Y, test_X
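# Note: step 13 maps each padded sentence to an index list via
# transform_data(sentence, vocab). A minimal sketch consistent with that call
# signature, assuming vocab behaves like a word-to-index mapping with an <UNK>
# entry (the Vocab class used above is not shown in this section):
def transform_data(sentence, vocab):
    """Map a space-separated, already-padded sentence to word indices,
    falling back to the <UNK> index for unknown words."""
    return [vocab[w] if w in vocab else vocab['<UNK>']
            for w in sentence.split(' ')]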
def build_dataset(search_dev_data_path, zhidao_dev_data_path):
    '''
    Load + preprocess the data.
    :param search_dev_data_path: path to the search dev set
    :param zhidao_dev_data_path: path to the zhidao dev set
    :return: None (artifacts are written to disk)
    '''
    # 1. Load the data (JSON lines)
    search_dev_df = pd.read_json(search_dev_data_path, encoding='utf-8', lines=True)
    zhidao_dev_df = pd.read_json(zhidao_dev_data_path, encoding='utf-8', lines=True)
    print('search dev data size {}, zhidao dev data size {}'.format(
        len(search_dev_df), len(zhidao_dev_df)))
    # 2. Per-column preprocessing
    search_dev_df['answers'] = search_dev_df[['answers']].apply(sentence_proc, axis=1)
    search_dev_df['entity_answers'] = search_dev_df[['entity_answers']].apply(sentences_proc, axis=1)
    search_dev_df['documents'] = search_dev_df[['documents']].apply(documents_proc, axis=1)
    zhidao_dev_df['answers'] = zhidao_dev_df[['answers']].apply(sentence_proc, axis=1)
    zhidao_dev_df['entity_answers'] = zhidao_dev_df[['entity_answers']].apply(sentences_proc, axis=1)
    zhidao_dev_df['documents'] = zhidao_dev_df[['documents']].apply(documents_proc, axis=1)
    # 3. Multi-process batch processing
    search_dev_df = parallelize(search_dev_df, split_sentences_proc)
    zhidao_dev_df = parallelize(zhidao_dev_df, split_sentences_proc)
    # 4. Merge the two dev sets
    # (fixed: the zhidao row previously merged search_dev_df's columns by mistake)
    search_dev_df['merged'] = search_dev_df[['documents', 'entity_answers', 'question', 'answers']].apply(
        lambda x: ' '.join(x), axis=1)
    zhidao_dev_df['merged'] = zhidao_dev_df[['documents', 'entity_answers', 'question', 'answers']].apply(
        lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([search_dev_df[['merged']], zhidao_dev_df[['merged']]], axis=0)
    print('search dev data size {}, zhidao dev data size {}, merged_df data size {}'.format(
        len(search_dev_df), len(zhidao_dev_df), len(merged_df)))
    # 5. Save the merged data
    merged_df.to_csv(merger_dev_seg_path, index=False, header=False)
    # 6. Train word vectors
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_dev_seg_path),
                        size=embedding_dim,
                        sg=1,
                        workers=cores,
                        iter=wv_train_epochs,
                        window=5,
                        min_count=5)
    # 7. Save the vocab built by gensim
    vocab = wv_model.wv.vocab
    save_dict(vocab_path, vocab)
    # 8. Save the word-vector model
    wv_model.save(save_wv_model_path)
    return
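# Note: a hypothetical invocation of this variant with DuReader-style dev
# files read as JSON lines; the paths below are illustrative:
if __name__ == '__main__':
    build_dataset('data/devset/search.dev.json',
                  'data/devset/zhidao.dev.json')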