def train_5_round():
    """Train the word-level ESIM model for 5 independent rounds of 5-fold CV.

    For each round x fold, the best checkpoint (per the CB callback's F1) is
    restored and its predictions on the validation split and on the test set
    are dumped to ``../prediction/esim_word/``.  Per-fold best F1 scores are
    appended to ``../log/5round_esim_word_f1.csv`` after every fold.
    """
    # Let TF allocate GPU memory on demand instead of grabbing it all upfront.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)
    f1_record = []
    for round_id in range(1, 6):
        f1_record.append(['round', round_id])
        vocabulary, embed_matrix = dh.load_embed(
            file_name='../data/word_embedding.txt')
        train_file_name = '../data/train.csv'
        test_file_name = '../data/test.csv'
        # FIX: pass maxlen explicitly — the model below is built with
        # config['maxlen'] and every sibling trainer passes it; omitting it
        # here relied on data_pipeline's default matching the config.
        q1, q2, y = data_pipeline(train_file_name,
                                  vocabulary=vocabulary,
                                  return_label=True,
                                  maxlen=config['maxlen'])
        q1_test, q2_test = data_pipeline(test_file_name,
                                         vocabulary=vocabulary,
                                         return_label=False,
                                         maxlen=config['maxlen'])
        kf = KFold(n_splits=5)
        split_id = 0
        for t_idx, v_idx in kf.split(q1, q2, y):
            split_id += 1
            print('Round [{}] fold [{}]'.format(round_id, split_id))
            q1_train = q1[t_idx]
            q1_val = q1[v_idx]
            q2_train = q2[t_idx]
            q2_val = q2[v_idx]
            t_y = y[t_idx]
            v_y = y[v_idx]
            # Fresh graph per fold so 25 sequential fits do not leak memory.
            keras.backend.clear_session()
            model = esim(embeddings=embed_matrix, maxlen=config['maxlen'])
            # NOTE: 'roung_' typo kept — changing it would break existing
            # checkpoint paths on disk.
            model_path = '../models/esim_word/roung_{}_kf_{}.h5'.format(
                round_id, split_id)
            cb = CB(val_data=([q1_val, q2_val], v_y), model_path=model_path)
            model.fit(x=[q1_train, q2_train],
                      y=t_y,
                      batch_size=config['batch_size'],
                      epochs=config['epochs'],
                      validation_data=([q1_val, q2_val], v_y),
                      verbose=True,
                      callbacks=[cb])
            # Restore the best checkpoint written by the callback.
            model.load_weights(model_path)
            # Record the fold's best validation F1 and flush the log to disk.
            max_f1 = cb.get_max_f1()
            f1_record.append([split_id, max_f1])
            f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
            f1_record_df.to_csv('../log/5round_esim_word_f1.csv', index=False)
            print('Kfold [{}] max f1: {}'.format(split_id, max_f1))
            # Out-of-fold predictions on the validation split.
            v_pred = model.predict(x=[q1_val, q2_val])
            np.save('../prediction/esim_word/round_{}_train_kf_{}.npy'.format(
                round_id, split_id), v_pred)
            # Predictions on the test set.
            y_pred = model.predict(x=[q1_test, q2_test])
            np.save('../prediction/esim_word/round_{}_test_kf_{}.npy'.format(
                round_id, split_id), y_pred)
    for item in f1_record:
        print('{}\t{}'.format(item[0], item[1]))
def train():
    """Train the word-level decomposable-attention model with 5-fold CV.

    Each fold's best validation F1 (tracked by the CB callback) is appended
    to the CSV at ``config['log_path']`` after the fold finishes.
    """
    # Let TF allocate GPU memory on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)
    vocabulary, embed_matrix = dh.load_embed(
        file_name='../data/word_embedding.txt')
    train_file_name = '../data/train.csv'
    test_file_name = '../data/test.csv'
    q1, q2, y = data_pipeline(train_file_name,
                              vocabulary=vocabulary,
                              return_label=True,
                              maxlen=config['maxlen'])
    # NOTE(review): test inputs are prepared but never used in this function
    # (no prediction dumping here) — kept in case callers re-enable it.
    q1_test, q2_test = data_pipeline(test_file_name,
                                     vocabulary=vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])
    kf = KFold(n_splits=5)
    split_id = 0
    f1_record = []
    for t_idx, v_idx in kf.split(q1, q2, y):
        split_id += 1
        print('## KFOLD [{}]'.format(split_id))
        q1_train = q1[t_idx]
        q1_val = q1[v_idx]
        q2_train = q2[t_idx]
        q2_val = q2[v_idx]
        train_y = y[t_idx]
        val_y = y[v_idx]
        # FIX: reset the TF graph between folds (all sibling trainers do
        # this); otherwise 5 sequential fits keep growing the graph/memory.
        keras.backend.clear_session()
        model = decomposable_attention(embeddings=embed_matrix,
                                       maxlen=config['maxlen'])
        model_path = config['model_path'].format(split_id)
        cb = CB(val_data=([q1_val, q2_val], val_y), model_path=model_path)
        model.fit(x=[q1_train, q2_train],
                  y=train_y,
                  batch_size=config['batch_size'],
                  epochs=config['epochs'],
                  validation_data=([q1_val, q2_val], val_y),
                  verbose=True,
                  callbacks=[cb])
        # Restore the best checkpoint chosen by the callback.
        model.load_weights(model_path)
        max_f1 = cb.get_max_f1()
        print('kfold [{}] max f1: {}'.format(split_id, max_f1))
        f1_record.append([split_id, max_f1])
        # Flush the running F1 log after every fold.
        f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
        f1_record_df.to_csv(config['log_path'], index=False)
def train():
    """Train the char-level model from ``get_model`` with 5-fold CV.

    Only the per-fold best validation F1 values are collected and printed at
    the end; log/prediction dumping is currently disabled.
    """
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)
    vocabulary, embed_matrix = dh.load_embed(
        file_name='../data/char_embedding.txt')
    train_file_name = '../data/train.csv'
    test_file_name = '../data/test.csv'
    q1, q2, y = data_pipeline(train_file_name,
                              vocabulary=vocabulary,
                              return_label=True,
                              maxlen=config['maxlen'])
    # NOTE(review): test inputs are unused while prediction dumping below
    # stays disabled.
    q1_test, q2_test = data_pipeline(test_file_name,
                                     vocabulary=vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])
    kf = KFold(n_splits=5)
    f1_record = []
    split_id = 0
    for t_idx, v_idx in kf.split(q1, q2, y):
        split_id += 1
        q1_train = q1[t_idx]
        q1_val = q1[v_idx]
        q2_train = q2[t_idx]
        q2_val = q2[v_idx]
        train_y = y[t_idx]
        val_y = y[v_idx]
        # Fresh graph per fold so sequential fits do not leak memory.
        keras.backend.clear_session()
        model = get_model(embed_matrix=embed_matrix, maxlen=config['maxlen'])
        # BUG FIX: the format string had no '{}' placeholder, so
        # .format(split_id) was a no-op and every fold overwrote the same
        # '../models/tmp.h5' checkpoint, losing all but the last fold's model.
        model_path = '../models/tmp_{}.h5'.format(split_id)
        cb = CB(val_data=([q1_val, q2_val], val_y), model_path=model_path)
        model.fit(x=[q1_train, q2_train],
                  y=train_y,
                  batch_size=config['batch_size'],
                  epochs=config['epochs'],
                  validation_data=([q1_val, q2_val], val_y),
                  callbacks=[cb])
        # Record the fold's best validation F1.
        max_f1 = cb.get_max_f1()
        f1_record.append([split_id, max_f1])
        # Log/prediction dumping intentionally disabled for now; re-enable by
        # writing f1_record to config['log_file'] and saving
        # model.predict([...]) arrays under ../prediction/bimpm_char/.
    for item in f1_record:
        print('{}\t{}'.format(item[0], item[1]))
def train_5_round():
    """Train the char-level ESIM (word2vec + glove embeddings + handcrafted
    features) for 5 rounds of 5-fold CV.

    For each round x fold, the best checkpoint is restored and predictions on
    the validation split and test set are dumped to
    ``../prediction/esim_char_gl_feature/``.  The running F1 log is flushed
    to ``../log/5round_esim_char_gl_feature_f1.csv`` after every fold.
    """
    # Let TF allocate GPU memory on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)
    f1_record = []
    for round_id in range(1, 6):
        f1_record.append(['round', round_id])
        vocabulary, embed_matrix = dh.load_embed(
            file_name='../data/char_embedding.txt')
        gl_vocabulary, gl_embed_matrix = dh.load_embed(
            file_name='../data/gl_char_vectors.txt')
        train_file_name = '../data/train.csv'
        test_file_name = '../data/test.csv'
        q1, q2, y = data_pipeline(train_file_name,
                                  vocabulary=vocabulary,
                                  return_label=True,
                                  maxlen=config['maxlen'])
        q1_gl, q2_gl = data_pipeline(train_file_name,
                                     vocabulary=gl_vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])
        q1_test, q2_test = data_pipeline(test_file_name,
                                         vocabulary=vocabulary,
                                         return_label=False,
                                         maxlen=config['maxlen'])
        q1_test_gl, q2_test_gl = data_pipeline(test_file_name,
                                               vocabulary=gl_vocabulary,
                                               return_label=False,
                                               maxlen=config['maxlen'])
        # Handcrafted feature matrices (rows aligned with train/test.csv).
        train_feature = pd.read_csv('../feature/train_feature.csv')
        train_feature = train_feature.values
        test_feature = pd.read_csv('../feature/test_feature.csv')
        test_feature = test_feature.values
        kf = KFold(n_splits=5)
        split_id = 0
        for t_idx, v_idx in kf.split(q1, q2, y):
            split_id += 1
            print('Round [{}] fold [{}]'.format(round_id, split_id))
            q1_train = q1[t_idx]
            q1_val = q1[v_idx]
            q2_train = q2[t_idx]
            q2_val = q2[v_idx]
            q1_gl_train = q1_gl[t_idx]
            q1_gl_val = q1_gl[v_idx]
            q2_gl_train = q2_gl[t_idx]
            q2_gl_val = q2_gl[v_idx]
            f_train = train_feature[t_idx]
            f_val = train_feature[v_idx]
            t_y = y[t_idx]
            v_y = y[v_idx]
            print('Ratio : {}'.format(np.sum(v_y) / len(v_y)))
            # (A q1/q2-swap augmentation experiment previously lived here;
            # removed as dead commented-out code.)
            keras.backend.clear_session()
            model = esim(embeddings=embed_matrix,
                         gl_embed_matrix=gl_embed_matrix,
                         maxlen=config['maxlen'],
                         lr=0.001,
                         f_dim=test_feature.shape[1])
            # NOTE: 'roung_' typo kept — changing it would break existing
            # checkpoint paths on disk.
            model_path = '../models/esim_char_gl_feature/roung_{}_fold_{}.h5'.format(
                round_id, split_id)
            cb = CB(val_data=([q1_val, q2_val, q1_gl_val, q2_gl_val, f_val],
                              v_y),
                    model_path=model_path)
            model.fit(
                x=[q1_train, q2_train, q1_gl_train, q2_gl_train, f_train],
                y=t_y,
                batch_size=config['batch_size'],
                epochs=config['epochs'],
                validation_data=([q1_val, q2_val, q1_gl_val, q2_gl_val,
                                  f_val], v_y),
                verbose=True,
                callbacks=[cb])
            # Restore the best checkpoint written by the callback.
            model.load_weights(model_path)
            # Record the fold's best validation F1 and flush the log.
            max_f1 = cb.get_max_f1()
            f1_record.append([split_id, max_f1])
            f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
            f1_record_df.to_csv('../log/5round_esim_char_gl_feature_f1.csv',
                                index=False)
            print('Kfold [{}] max f1: {}'.format(split_id, max_f1))
            # BUG FIX: the feature input here was f_train (training rows),
            # which mismatches the validation fold both in row count and
            # content; it must be f_val for valid out-of-fold predictions.
            v_pred = model.predict(
                x=[q1_val, q2_val, q1_gl_val, q2_gl_val, f_val])
            np.save(
                '../prediction/esim_char_gl_feature/round_{}_train_kf_{}.npy'.
                format(round_id, split_id), v_pred)
            y_pred = model.predict(
                x=[q1_test, q2_test, q1_test_gl, q2_test_gl, test_feature])
            np.save(
                '../prediction/esim_char_gl_feature/round_{}_test_kf_{}.npy'.
                format(round_id, split_id), y_pred)
    # Print the full 5-round F1 summary.
    for item in f1_record:
        print('{}\t{}'.format(item[0], item[1]))
def train():
    """Train the char-level ESIM (word2vec + glove embeddings + handcrafted
    features) with 5-fold CV and print each fold's best validation F1.

    Unlike the 5-round variant, this function only prints the F1 record at
    the end; nothing is written to the log or prediction directories.
    """
    # Let TF allocate GPU memory on demand instead of grabbing it all upfront.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)
    # Two embedding tables: word2vec-style chars and glove chars, each with
    # its own vocabulary, so the same text is indexed twice below.
    vocabulary, embed_matrix = dh.load_embed(
        file_name='../data/char_embedding.txt')
    gl_vocabulary, gl_embed_matrix = dh.load_embed(
        file_name='../data/gl_char_vectors.txt')
    train_file_name = '../data/train.csv'
    test_file_name = '../data/test.csv'
    test_data = pd.read_csv(test_file_name)  # NOTE(review): unused here
    q1, q2, y = data_pipeline(train_file_name,
                              vocabulary=vocabulary,
                              return_label=True,
                              maxlen=config['maxlen'])
    q1_gl, q2_gl = data_pipeline(train_file_name,
                                 vocabulary=gl_vocabulary,
                                 return_label=False,
                                 maxlen=config['maxlen'])
    # NOTE(review): test inputs are prepared but never predicted on in this
    # function; only test_feature's width is used (for f_dim below).
    q1_test, q2_test = data_pipeline(test_file_name,
                                     vocabulary=vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])
    # Handcrafted feature matrices; rows presumably aligned with
    # train.csv / test.csv row order — TODO confirm against the feature
    # generation script.
    train_feature = pd.read_csv('../feature/train_feature.csv')
    train_feature = train_feature.values
    test_feature = pd.read_csv('../feature/test_feature.csv')
    test_feature = test_feature.values
    print('Feature dim : {}'.format(test_feature.shape[-1]))
    kf = KFold(n_splits=5)
    split_id = 0
    f1_record = []
    for t_idx, v_idx in kf.split(q1, q2, y):
        split_id += 1
        print('Kfold [{}]'.format(split_id))
        # Slice every parallel input array with the same fold indices.
        q1_train = q1[t_idx]
        q1_val = q1[v_idx]
        q2_train = q2[t_idx]
        q2_val = q2[v_idx]
        q1_gl_train = q1_gl[t_idx]
        q1_gl_val = q1_gl[v_idx]
        q2_gl_train = q2_gl[t_idx]
        q2_gl_val = q2_gl[v_idx]
        f_train = train_feature[t_idx]
        f_val = train_feature[v_idx]
        t_y = y[t_idx]
        v_y = y[v_idx]
        # Fresh graph per fold so sequential fits do not accumulate state.
        keras.backend.clear_session()
        model = esim(embeddings=embed_matrix,
                     gl_embed_matrix=gl_embed_matrix,
                     maxlen=config['maxlen'],
                     lr=0.001,
                     f_dim=test_feature.shape[1])
        model_path = config['model_path'].format(split_id)
        # CB tracks validation F1 and checkpoints the best weights.
        cb = CB(val_data=([q1_val, q2_val, q1_gl_val, q2_gl_val, f_val], v_y),
                model_path=model_path)
        model.fit(
            x=[q1_train, q2_train, q1_gl_train, q2_gl_train, f_train],
            y=t_y,
            batch_size=config['batch_size'],
            epochs=config['epochs'],
            validation_data=([q1_val, q2_val, q1_gl_val, q2_gl_val, f_val],
                             v_y),
            verbose=True,
            callbacks=[cb])
        # Record each fold's best validation F1.
        max_f1 = cb.get_max_f1()
        f1_record.append([split_id, max_f1])
    for item in f1_record:
        print('{}\t{}'.format(item[0], item[1]))
def train_5_round():
    """Train the char-level decomposable-attention model for 5 rounds of
    5-fold CV, saving per-fold validation/test predictions and F1 logs.

    Outputs go to ``../prediction/de_att_char/`` and
    ``../log/5round_de_att_char_f1.csv``.
    """
    # Let TF allocate GPU memory on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)
    KTF.set_session(session)
    f1_record = []
    for round_id in range(1, 6):
        f1_record.append(['round', round_id])
        vocabulary, embed_matrix = dh.load_embed(
            file_name='../data/char_embedding.txt')
        train_file_name = '../data/train.csv'
        test_file_name = '../data/test.csv'
        q1, q2, y = data_pipeline(train_file_name,
                                  vocabulary=vocabulary,
                                  return_label=True,
                                  maxlen=config['maxlen'])
        q1_test, q2_test = data_pipeline(test_file_name,
                                         vocabulary=vocabulary,
                                         return_label=False,
                                         maxlen=config['maxlen'])
        kf = KFold(n_splits=5)
        split_id = 0
        for t_idx, v_idx in kf.split(q1, q2, y):
            split_id += 1
            print('Round [{}] fold [{}]'.format(round_id, split_id))
            q1_train = q1[t_idx]
            q1_val = q1[v_idx]
            q2_train = q2[t_idx]
            q2_val = q2[v_idx]
            train_y = y[t_idx]
            val_y = y[v_idx]
            # FIX: reset the TF graph between folds (all sibling trainers do
            # this); without it 25 sequential fits keep growing graph/memory.
            keras.backend.clear_session()
            model = decomposable_attention(embeddings=embed_matrix,
                                           maxlen=config['maxlen'])
            # NOTE: path lacks a '.h5' extension (unlike sibling trainers);
            # kept as-is so existing checkpoints remain loadable.
            model_path = '../models/de_att_char/round_{}_kf_{}'.format(
                round_id, split_id)
            cb = CB(val_data=([q1_val, q2_val], val_y), model_path=model_path)
            model.fit(x=[q1_train, q2_train],
                      y=train_y,
                      batch_size=config['batch_size'],
                      epochs=config['epochs'],
                      validation_data=([q1_val, q2_val], val_y),
                      verbose=True,
                      callbacks=[cb])
            # Restore the best checkpoint written by the callback.
            model.load_weights(model_path)
            max_f1 = cb.get_max_f1()
            print('Round [{}] fold [{}] max f1: {}'.format(
                round_id, split_id, max_f1))
            f1_record.append([split_id, max_f1])
            # Flush the running F1 log after every fold.
            f1_record_df = pd.DataFrame(f1_record, columns=['fold', 'f1'])
            f1_record_df.to_csv('../log/5round_de_att_char_f1.csv',
                                index=False)
            # Out-of-fold predictions on the validation split.
            v_pred = model.predict([q1_val, q2_val])
            np.save(
                '../prediction/de_att_char/round_{}_train_kf_{}.npy'.format(
                    round_id, split_id), v_pred)
            # Predictions on the test set.
            y_pred = model.predict([q1_test, q2_test])
            np.save(
                '../prediction/de_att_char/round_{}_test_kf_{}.npy'.format(
                    round_id, split_id), y_pred)
def train():
    """Run 5-fold cross-validation training for the char-level ESIM model.

    Each fold restores its best checkpoint (selected by the CB callback on
    validation F1), appends the fold's best F1 to the CSV at
    ``config['log_file']``, and finally prints the whole F1 record.
    """
    # Enable on-demand GPU memory allocation.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    KTF.set_session(tf.Session(config=tf_config))
    vocabulary, embed_matrix = dh.load_embed(
        file_name='../data/char_embedding.txt')
    train_file_name = '../data/train.csv'
    test_file_name = '../data/test.csv'
    test_data = pd.read_csv(test_file_name)
    q1, q2, y = data_pipeline(train_file_name,
                              vocabulary=vocabulary,
                              return_label=True,
                              maxlen=config['maxlen'])
    q1_test, q2_test = data_pipeline(test_file_name,
                                     vocabulary=vocabulary,
                                     return_label=False,
                                     maxlen=config['maxlen'])
    result = []
    f1_record = []
    folds = KFold(n_splits=5).split(q1, q2, y)
    for split_id, (tr_idx, va_idx) in enumerate(folds, start=1):
        print('Kfold [{}]'.format(split_id))
        # Gather the two-sentence inputs and labels for this fold.
        train_inputs = [q1[tr_idx], q2[tr_idx]]
        val_inputs = [q1[va_idx], q2[va_idx]]
        train_labels = y[tr_idx]
        val_labels = y[va_idx]
        # Fresh graph per fold so successive fits do not accumulate state.
        keras.backend.clear_session()
        model = esim(embeddings=embed_matrix,
                     maxlen=config['maxlen'],
                     lr=0.001)
        model_path = config['model_path'].format(split_id)
        cb = CB(val_data=(val_inputs, val_labels), model_path=model_path)
        model.fit(x=train_inputs,
                  y=train_labels,
                  batch_size=config['batch_size'],
                  epochs=config['epochs'],
                  validation_data=(val_inputs, val_labels),
                  verbose=True,
                  callbacks=[cb])
        # Load the best checkpoint chosen by the callback.
        model.load_weights(model_path)
        # Record the fold's best validation F1 and flush the log to disk.
        max_f1 = cb.get_max_f1()
        f1_record.append([split_id, max_f1])
        pd.DataFrame(f1_record, columns=['fold', 'f1']).to_csv(
            config['log_file'], index=False)
        print('Kfold [{}] max f1: {}'.format(split_id, max_f1))
        # (Dumping of validation/test predictions to ../prediction/esim_char/
        # is intentionally disabled.)
    for item in f1_record:
        print('{}\t{}'.format(item[0], item[1]))