def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        # whether to capture the CNN middle-layer output
        get_cnn_middle_layer_output=False,
        middle_layer_output_file=None,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
):
    """Run k-fold cross validation over every filter count in ``num_filter_list``.

    Parameters
    ----------
    train_data : array-like
        training data, as a (train_X, train_y) pair
    test_data : array-like
        test data
    cv_data : array-like
        the k folds of validation data
    input_length : int
        input (sentence) length
    num_filter_list : array-like
        hyper-parameter grid: the numbers of filters to validate
    middle_layer_output_file : str
        file the middle-layer output is written to
    get_cnn_middle_layer_output : bool
        whether to collect the CNN middle-layer output (default False)
    num_labels : int
        number of labels
    batch_size : int
        batch size
    vocabulary_including_test_set : bool, default True
        whether the vocabulary includes the test set
    include_train_data : bool
        whether to also validate on the training data
    need_validation : bool
        whether to run validation
    embedding_weight_trainable : bool
        switches between CNN(static-w2v) and CNN(non-static-w2v)
    rand_weight : bool
        switches between CNN(rand) and CNN(static/non-static-w2v)
    feature_type : str
        feature type
    verbose : int
        larger values produce more detailed output
    cv : int
        number of cross-validation folds
    need_segmented : bool
        whether word segmentation is needed
    word2vec_model_file_path : str
        path to the word2vec model file

    Notes
    ----------
    - For efficiency, update_dictionary=False by default so all feature
      encoders share one dictionary instead of rebuilding it repeatedly.
    - diff_train_val_feature_encoder=1 keeps the train-set and validation-set
      feature encoders distinct, because their dictionary sizes differ.

    Examples
    ----------
    >>> train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
    >>> train_y = [1, 2, 3, 2, 3]
    >>> test_x = ['你好', '不错哟']
    >>> test_y = [1, 2]
    >>> cv_x = [['你好', '无聊'], ['测试句子', '今天天气不错'], ['我要买手机']]
    >>> cv_y = [[1, 3], [2, 2], [3]]
    >>> WordEmbeddingCNNWithOneConv.cross_validation(
    >>>         train_data = (train_x,train_y),
    >>>         test_data=(test_x,test_y),
    >>>         input_length=8,
    >>>         num_filter_list=[5,50],
    >>>         verbose=1,
    >>>         word2vec_model_file_path = '/home/jdwang/PycharmProjects/nlp_util/data_processing_util/word2vec_util/vector/50dim/vector1000000_50dim.gem',
    >>>     )
    """
    print('=' * 80)
    # NOTE(review): the original literal was split mid-string by formatting
    # damage; reconstructed here as a single format string.
    print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s' % (
        feature_type, need_segmented, vocabulary_including_test_set))
    print('input_length: %d, num_labels: %d' % (input_length, num_labels))
    print('lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s' % (
        lr, batch_size, rand_weight, embedding_weight_trainable))
    if not rand_weight:
        print('W2V model file_path: %s' % word2vec_model_file_path)
    print('=' * 80)

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. obtain the k-fold cross-validation splits
    if cv_data is None:
        assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. feature-encode the data
    feature_encoder = WordEmbeddingCNN.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        padding_mode='center',
        # keep the dictionary fixed so all folds share one vocabulary
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

    # cross validation over the filter-count grid
    for num_filter in num_filter_list:
        print('=' * 40)
        print('num_filter is %d.' % num_filter)
        _, _, middle_output_dev, middle_output_val = get_val_score(
            WordEmbeddingCNNWithOneConv,
            cv_data=cv_data[:],
            verbose=verbose,
            num_filter=num_filter,
            num_labels=num_labels,
            word2vec_model_file_path=word2vec_model_file_path,
            embedding_weight_trainable=embedding_weight_trainable,
            get_cnn_middle_layer_output=get_cnn_middle_layer_output,
            need_validation=need_validation,
            rand_weight=rand_weight,
            batch_size=batch_size,
            lr=lr,
        )

        if get_cnn_middle_layer_output:
            # Persist the middle-layer outputs. Fixed: pickle.dump requires a
            # binary-mode file ('wb'); text mode raises TypeError on Python 3.
            with open(middle_layer_output_file, 'wb') as fout:
                pickle.dump(cv_data, fout)
                pickle.dump(middle_output_dev, fout)
                pickle.dump(middle_output_val, fout)
label_to_index,index_to_label = data_util.get_label_index(version=config['label_version']) # **************************************************************** # ------------- region end : 1. 加载训练数据和测试数据 ------------- # **************************************************************** # **************************************************************** # +++++++++++++ region start : 2. 转换数据的格式并特征编码 +++++++++++++ # **************************************************************** logging.debug('=' * 20) logging.debug('2. 转换数据的格式并特征编码') from data_processing_util.cross_validation_util import transform_cv_data from deep_learning.cnn.wordEmbedding_cnn.wordEmbedding_cnn_model import WordEmbeddingCNN feature_encoder = WordEmbeddingCNN.get_feature_encoder( **{'input_length': input_length, 'feature_type':feature_type,} ) train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix()) feature_encoder.print_model_descibe() feature_encoder.print_sentence_length_detail() # train_y = train_data['LABEL_INDEX'].as_matrix() test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix()) test_all_y = test_data['LABEL_INDEX'].as_matrix() print(train_data['LABEL_INDEX'].as_matrix()) print(train_X_feature.shape) print(test_all_X_feature.shape)
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        shuffle_data=True,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
        n_estimators_list=None,
):
    """Grid-search k-fold cross validation for the RF + word-embedding-CNN merge model.

    Validates every (num_filter, n_estimators) pair from ``num_filter_list``
    x ``n_estimators_list``. Either ``cv_data`` (pre-split folds) or
    ``train_data`` must be supplied; when only ``train_data`` is given, ``cv``
    folds are generated via ``get_k_fold_data``. update_dictionary=False keeps
    one shared vocabulary across folds, and diff_train_val_feature_encoder=1
    keeps train/validation encoders distinct.
    """
    print('=' * 80)
    print(
        'feature_type:%s,need_segmented:%s,vocabulary_including_test_set:%s' % (
            feature_type, need_segmented, vocabulary_including_test_set))
    print('rand_weight:%s,embedding_weight_trainable:%s' % (rand_weight, embedding_weight_trainable))
    print('=' * 80)

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. split into k folds unless the caller already supplied them
    if cv_data is None:
        assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. feature-encode every fold with a single shared vocabulary
    feature_encoder = WordEmbeddingCNN.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        # keep the dictionary identical across folds
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )
    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

    # arguments that stay constant across the whole grid search
    fixed_kwargs = dict(
        verbose=verbose,
        num_labels=num_labels,
        word2vec_model_file_path=word2vec_model_file_path,
        embedding_weight_trainable=embedding_weight_trainable,
        need_validation=need_validation,
        rand_weight=rand_weight,
        batch_size=batch_size,
        lr=lr,
        shuffle_data=shuffle_data,
    )

    # cross-validate every point of the hyper-parameter grid
    for n_filter in num_filter_list:
        for n_est in n_estimators_list:
            print('=' * 40)
            print('num_filter and n_estimators is %d,%d.' % (n_filter, n_est))
            get_val_score(
                RFAndRFAndWordEmbeddingCnnMerge,
                num_filter=n_filter,
                n_estimators=n_est,
                cv_data=cv_data[:],
                **fixed_kwargs
            )
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        # whether to capture the CNN middle-layer output
        get_cnn_middle_layer_output=False,
        middle_layer_output_file=None,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
):
    """Run k-fold cross validation over every filter count in ``num_filter_list``.

    Parameters
    ----------
    train_data : array-like
        training data, as a (train_X, train_y) pair
    test_data : array-like
        test data
    cv_data : array-like
        the k folds of validation data
    input_length : int
        input (sentence) length
    num_filter_list : array-like
        hyper-parameter grid: the numbers of filters to validate
    middle_layer_output_file : str
        file the middle-layer output is written to
    get_cnn_middle_layer_output : bool
        whether to collect the CNN middle-layer output (default False)
    num_labels : int
        number of labels
    batch_size : int
        batch size
    vocabulary_including_test_set : bool, default True
        whether the vocabulary includes the test set
    include_train_data : bool
        whether to also validate on the training data
    need_validation : bool
        whether to run validation
    embedding_weight_trainable : bool
        switches between CNN(static-w2v) and CNN(non-static-w2v)
    rand_weight : bool
        switches between CNN(rand) and CNN(static/non-static-w2v)
    feature_type : str
        feature type
    verbose : int
        larger values produce more detailed output
    cv : int
        number of cross-validation folds
    need_segmented : bool
        whether word segmentation is needed
    word2vec_model_file_path : str
        path to the word2vec model file

    Notes
    ----------
    - For efficiency, update_dictionary=False by default so all feature
      encoders share one dictionary instead of rebuilding it repeatedly.
    - diff_train_val_feature_encoder=1 keeps the train-set and validation-set
      feature encoders distinct, because their dictionary sizes differ.

    Examples
    ----------
    >>> train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
    >>> train_y = [1, 2, 3, 2, 3]
    >>> test_x = ['你好', '不错哟']
    >>> test_y = [1, 2]
    >>> cv_x = [['你好', '无聊'], ['测试句子', '今天天气不错'], ['我要买手机']]
    >>> cv_y = [[1, 3], [2, 2], [3]]
    >>> WordEmbeddingCNNWithOneConv.cross_validation(
    >>>         train_data = (train_x,train_y),
    >>>         test_data=(test_x,test_y),
    >>>         input_length=8,
    >>>         num_filter_list=[5,50],
    >>>         verbose=1,
    >>>         word2vec_model_file_path = '/home/jdwang/PycharmProjects/nlp_util/data_processing_util/word2vec_util/vector/50dim/vector1000000_50dim.gem',
    >>>     )
    """
    print('=' * 80)
    # NOTE(review): the original literal was split mid-string by formatting
    # damage; reconstructed here as a single format string.
    print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s' % (
        feature_type, need_segmented, vocabulary_including_test_set))
    print('input_length: %d, num_labels: %d' % (input_length, num_labels))
    print('lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s' % (
        lr, batch_size, rand_weight, embedding_weight_trainable))
    if not rand_weight:
        print('W2V model file_path: %s' % word2vec_model_file_path)
    print('=' * 80)

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. obtain the k-fold cross-validation splits
    if cv_data is None:
        assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. feature-encode the data
    feature_encoder = WordEmbeddingCNN.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        padding_mode='center',
        # keep the dictionary fixed so all folds share one vocabulary
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

    # cross validation over the filter-count grid
    for num_filter in num_filter_list:
        print('=' * 40)
        print('num_filter is %d.' % num_filter)
        _, _, middle_output_dev, middle_output_val = get_val_score(
            WordEmbeddingCNNWithOneConv,
            cv_data=cv_data[:],
            verbose=verbose,
            num_filter=num_filter,
            num_labels=num_labels,
            word2vec_model_file_path=word2vec_model_file_path,
            embedding_weight_trainable=embedding_weight_trainable,
            get_cnn_middle_layer_output=get_cnn_middle_layer_output,
            need_validation=need_validation,
            rand_weight=rand_weight,
            batch_size=batch_size,
            lr=lr,
        )

        if get_cnn_middle_layer_output:
            # Persist the middle-layer outputs. Fixed: pickle.dump requires a
            # binary-mode file ('wb'); text mode raises TypeError on Python 3.
            with open(middle_layer_output_file, 'wb') as fout:
                pickle.dump(cv_data, fout)
                pickle.dump(middle_output_dev, fout)
                pickle.dump(middle_output_val, fout)