def test_single_bow():
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['句子,句子', '你好', '你妹']
    test_y = [2, 3, 0]
    # Generate combined character/word-level features.
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type='word_seg',
        max_features=2000,
    )
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(feature_encoder.vocabulary_size)
    print(','.join(feature_encoder.vocabulary))
    print(train_X_feature)
    print(test_X_feature)
    bow_cnn = SingleChannelBowCNN(
        rand_seed=1337,
        verbose=1,
        feature_encoder=feature_encoder,
        num_labels=4,
        input_length=feature_encoder.vocabulary_size,
        # Each filter spec below appears to be
        # [nb_filter, filter_length, stride, border_mode, pool_size, dropout,
        #  activation, normalization] -- inferred from usage, not verified
        # against the SingleChannelBowCNN implementation.
        l1_conv_filter_type=[
            [5, 2, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
            [5, 4, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
            # [5, 4, 1, 'valid', (-2, 1), 0.],
            # [5, 6, 1, 'valid', (-2, 1), 0.],
        ],
        l2_conv_filter_type=[[3, 2, 1, 'valid', (2, 1), 0., 'relu', 'none']],
        full_connected_layer_units=[(50, 0.25, 'relu', 'none'),
                                    (100, 0.25, 'relu', 'none')],
        # full_connected_layer_units=[50, 100],
        output_dropout_rate=0.,
        nb_epoch=30,
        earlyStoping_patience=50,
        optimizers='sgd',
        batch_size=2,
    )
    bow_cnn.print_model_descibe()
    # bow_cnn.model_from_pickle('model/AA.pkl')
    print(bow_cnn.fit((train_X_feature, train_y), (test_X_feature, test_y)))
    print(bow_cnn.predict('你好', transform_input=True))
    # print(bow_cnn.get_layer_output(['你好'], layer='conv2', transform_input=True))
    print(bow_cnn.get_layer_output(['好'], layer='hidden2', transform_input=True))
    # print(bow_cnn.get_layer_output(['好'], layer='batchnormalization', transform_input=True))
    bow_cnn.accuracy((test_X_feature, test_y))
    print(bow_cnn.batch_predict(test_X, True))
    print(bow_cnn.batch_predict(test_X_feature, False))
def get_feature_encoder(**kwargs):
    '''
    Return the input feature encoder for this model.

    :param kwargs: configurable options:
        [full_mode(#, False), feature_type(#, word), verbose(#, 0),
        word2vec_to_solve_oov(#, False), word2vec_model_file_path(#, None)].
        Options marked * are required; options marked # are optional and fall
        back to the default shown.
    :return: FeatureEncoder
    '''
    feature_encoder = FeatureEncoder(
        verbose=kwargs.get('verbose', 0),
        need_segmented=kwargs.get('need_segmented', True),
        full_mode=kwargs.get('full_mode', False),
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        add_unkown_word=True,
        feature_type=kwargs.get('feature_type', 'word'),
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        max_features=2000,
        word2vec_to_solve_oov=kwargs.get('word2vec_to_solve_oov', False),
        word2vec_model_file_path=kwargs.get('word2vec_model_file_path', None))
    if kwargs.get('verbose', 0) > 0:
        pprint.pprint(kwargs)
    return feature_encoder
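def _example_get_feature_encoder():
    # Minimal usage sketch for get_feature_encoder (added for illustration;
    # the keyword values below are arbitrary examples, not project defaults).
    # Assumes FeatureEncoder and pprint are imported at module level, as
    # elsewhere in this file.
    encoder = get_feature_encoder(feature_type='word_seg', full_mode=True, verbose=1)
    train_X_feature = encoder.fit_transform(['你好', '测试句子'])
    test_X_feature = encoder.transform(['句子'])
    print(train_X_feature)
    print(test_X_feature)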
# endregion

if __name__ == '__main__':
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['你妹', '句子', '你好']
    test_y = [2, 3, 0]
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        max_features=2000,
        feature_type='seg',
    )
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(','.join(feature_encoder.vocabulary))
    print(train_X_feature)
    bow_rf = BowRandomForest(
        rand_seed=1337,
        verbose=0,
    return detail


if __name__ == '__main__':
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['句子,句子', '你好', '你妹']
    test_y = [2, 3, 0]
    # Generate word-level (segmentation) features.
    seg_feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type='seg',
        max_features=2000,
    )
    train_seg_X_feature = seg_feature_encoder.fit_transform(train_X)
    test_seg_X_feature = seg_feature_encoder.transform(test_X)
    print(seg_feature_encoder.vocabulary_size)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_seg_X_feature)
    print(test_seg_X_feature)
    # Generate character-level features.
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
# ------------------------------------------------------------------------------
# -------------- region start : convert data to features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Generating feature vectors...')
logging.debug('Extracting feature vectors with %s' % (config['model']))
print('Extracting feature vectors with %s' % (config['model']))

if config['refresh_all_model']:
    feature_encoder = FeatureEncoder(
        verbose=config['verbose'],
        need_segmented=True,
        full_mode=config['full_mode'],
        remove_stopword=config['remove_stopword'],
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method=config['model'],
        max_features=config['max_features'],
    )
    train_X_feature = feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
    test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
    print(train_X_feature)
    feature_encoder.print_model_descibe()
    keywords = feature_encoder.vocabulary
    print(','.join(keywords))
    print(len(keywords))
else:
    feature_encoder = None

# NOTE: train_X_feature is only defined when config['refresh_all_model'] is
# True; the debug line below will raise a NameError on the else branch.
logging.debug('fit X shape is :%s' % (str(train_X_feature.shape)))
# print(train_data.head())
logging.debug('=' * 20)
logging.debug('Generating feature vectors...')

from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
from data_processing_util.cross_validation_util import transform_cv_data

feature_encoder = FeatureEncoder(
    verbose=config['verbose'],
    need_segmented=True,
    full_mode=config['full_mode'],
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    feature_method=config['model'],
    feature_type=feature_type,
    max_features=config['max_features'],
    word2vec_to_solve_oov=word2vec_to_solve_oov,
    word2vec_model_file_path=config['word2vec_model_file_path'],
)
# print(train_X_feature)
# feature_encoder.print_model_descibe()
# keywords = feature_encoder.vocabulary
# print(','.join(keywords))
# print('vocabulary size: %d' % len(keywords))
# ------------------------------------------------------------------------------
# -------------- region start : convert data to features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Generating feature vectors...')
logging.debug('Extracting feature vectors with %s' % (config['model']))
print('Extracting feature vectors with %s' % (config['model']))

# Generate word-level (segmentation) features.
seg_feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=False,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    feature_method='bow',
    feature_type='seg',
    max_features=2000,
)
train_seg_X_feature = seg_feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
test_seg_X_feature = seg_feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
print(seg_feature_encoder.vocabulary_size)
print(','.join(seg_feature_encoder.vocabulary))
# print(train_X_feature)
# print(test_X_feature)

# Generate character-level features.
word_feature_encoder = FeatureEncoder(
def process_train_data_for_k_fold(k=3):
    '''
    Split the training data into k folds for cross validation, keeping the
    label distribution as balanced as possible across folds.

    Input file:   v2.2/v2.2_train_Sa_884.csv
    Output files: v2.2/v2.2_train_Sa_i%d_%d.csv

    :return:
    '''
    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()
    feature_type = 'seg'
    # L or Sa
    dataset_type = 'S'
    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0'
    }
    train_data, test_data = data_util.load_train_test_data(config)
    label_to_index, index_to_label = data_util.get_label_index(config['label_version'])
    # print(train_data.head())
    train_X = train_data['SENTENCE'].as_matrix()
    train_y = train_data['LABEL_INDEX'].as_matrix()
    test_X = test_data['SENTENCE'].as_matrix()
    test_y = test_data['LABEL_INDEX'].as_matrix()

    # Split the training set into k folds and save each fold to disk.
    cv_x = []
    cv_y = []
    for index, (x, y) in enumerate(
            data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)):
        cv_x.append(x)
        cv_y.append(y)
        # print(y)
        y = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': y, 'SENTENCE': x})
        data_util.save_data(
            cv_data,
            'result/cv_data/v2.3_train_%s_i%d_%d.csv' % (dataset_type, index, len(cv_data)))
        print(len(x))
    # quit()

    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )
    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y), (test_X, test_y), verbose=1)

    # Save each (dev, val) split with the label prepended as the first column.
    counter = 0
    for dev_X, dev_y, val_X, val_y in all_cv_data:
        counter += 1
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' % (dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' % (dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )
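def _example_load_fold(path):
    # Minimal sketch (added for illustration): read one saved fold back in.
    # Each row stores the label in column 0 and the BOW feature vector after
    # it, matching the np.savetxt layout in process_train_data_for_k_fold
    # above. The function name and `path` argument are hypothetical.
    import numpy as np
    data = np.loadtxt(path, dtype=int, delimiter=',')
    y, X = data[:, 0], data[:, 1:]
    return X, y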
def cross_validation(cv_data, test_data, result_file_path, **kwargs):
    '''
    Cross-validate the hyper-parameters.

    :param cv_data: k folds of training data
    :type cv_data: array-like
    :param test_data: test data
    :type test_data: array-like
    :return:
    '''
    nb_epoch = kwargs['nb_epoch']
    verbose = kwargs['verbose']
    num_labels = 24
    feature_type = kwargs['feature_type']
    remove_stopword = kwargs['remove_stopword']
    word2vec_to_solve_oov = kwargs['word2vec_to_solve_oov']
    rand_seed = kwargs['rand_seed']
    l1_conv_filter_type = kwargs['l1_conv_filter_type']
    l2_conv_filter_type = kwargs['l2_conv_filter_type']
    k = kwargs['k']

    # Detailed results are saved to...
    detail_result_file_path = result_file_path
    fout = open(detail_result_file_path, 'w')

    print('=' * 150)
    print('use word2vec:%s\nfeature_type:%s\nremove_stopword:%s\nnb_epoch:%d\nrand_seed:%d' % (
        word2vec_to_solve_oov, feature_type, remove_stopword, nb_epoch, rand_seed))
    print('l1_conv_filter_type:%s' % l1_conv_filter_type)
    print('l2_conv_filter_type:%s' % l2_conv_filter_type)
    print('k:%s' % k)
    print('=' * 150)

    fout.write('=' * 150 + '\n')
    fout.write('single-channel CNN-bow CV results:\n')
    fout.write('feature_type:%s\nnb_epoch:%d\nrand_seed:%d\n' % (feature_type, nb_epoch, rand_seed))
    fout.write('l1_conv_filter_type:%s\n' % l1_conv_filter_type)
    fout.write('l2_conv_filter_type:%s\n' % l2_conv_filter_type)
    fout.write('k:%s\n' % k)
    fout.write('=' * 150 + '\n')

    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    from data_processing_util.cross_validation_util import transform_cv_data

    feature_encoder = FeatureEncoder(
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        max_features=2000,
        **kwargs)

    all_cv_data = transform_cv_data(feature_encoder, cv_data, test_data, **kwargs)

    # Grid search over conv-layer and hidden-layer sizes.
    for layer1 in kwargs['layer1']:
        for layer2 in kwargs['layer2']:
            for hidden1 in kwargs['hidden1']:
                for hidden2 in kwargs['hidden2']:
                    print('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d' % (layer1, layer2, hidden1, hidden2))
                    fout.write('=' * 150 + '\n')
                    fout.write('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d\n' % (layer1, layer2, hidden1, hidden2))
                    # k-fold cross validation
                    print('Starting k-fold cross validation...')
                    counter = 0
                    test_acc = []
                    train_acc = []
                    for dev_X, dev_y, val_X, val_y in all_cv_data:
                        # print(dev_X2.shape)
                        print('-' * 80)
                        fout.write('-' * 80 + '\n')
                        if counter == 0:
                            # The first split is the full training run; the rest are CV folds.
                            print('training:')
                            fout.write('training\n')
                        else:
                            print('validation fold %d' % counter)
                            fout.write('validation fold %d\n' % counter)
                        bow_cnn = SingleChannelBowCNN(
                            rand_seed=rand_seed,
                            verbose=verbose,
                            feature_encoder=None,
                            num_labels=num_labels,
                            input_length=dev_X.shape[1],
                            l1_conv_filter_type=[
                                # [layer1, l1_conv_filter_type[0], -1, 'valid', (k[0], 1), 0., 'relu', 'none'],
                                # [layer1, l1_conv_filter_type[1], -1, 'valid', (k[0], 1), 0., 'relu', 'none'],
                                # [layer1, l1_conv_filter_type[2], -1, 'valid', (k[0], 1), 0., 'relu', 'batch_normalization'],
                            ],
                            l2_conv_filter_type=[[
                                layer2, l2_conv_filter_type[0], -1, 'valid',
                                (k[1], 1), 0., 'relu', 'batch_normalization'
                            ]],
                            full_connected_layer_units=[
                                (hidden1, 0.5, 'relu', 'none'),
                                (hidden2, 0.5, 'relu', 'none')
                            ],
                            nb_epoch=nb_epoch,
                            earlyStoping_patience=50,
                            optimizers='sgd',
                            batch_size=32,
                            lr=1e-2,
                        )
                        # bow_cnn.print_model_descibe()
                        dev_loss, dev_accuracy, \
                            val_loss, val_accuracy = bow_cnn.fit((dev_X, dev_y), (val_X, val_y))
                        print('dev:%f,%f' % (dev_loss, dev_accuracy))
                        print('val:%f,%f' % (val_loss, val_accuracy))
                        fout.write('dev:%f,%f\n' % (dev_loss, dev_accuracy))
                        fout.write('val:%f,%f\n' % (val_loss, val_accuracy))
                        test_acc.append(val_accuracy)
                        train_acc.append(dev_accuracy)
                        counter += 1
                    print('k-fold validation results: %s' %
                          test_acc)
                    print('mean validation accuracy: %f' % np.average(test_acc[1:]))
                    print('-' * 80)
                    fout.write('k-fold training results: %s\n' % train_acc)
                    fout.write('k-fold validation results: %s\n' % test_acc)
                    fout.write('mean: %f\n' % np.average(test_acc[1:]))
                    fout.write('-' * 80 + '\n')
                    fout.flush()
    fout.close()
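def _example_cross_validation(cv_x, cv_y, test_X, test_y):
    # Minimal sketch (added for illustration) of how cross_validation above is
    # meant to be invoked. All values below are hypothetical examples; note
    # that kwargs are forwarded verbatim to both FeatureEncoder and
    # transform_cv_data, so they must be accepted by those interfaces.
    cross_validation(
        cv_data=(cv_x, cv_y),
        test_data=(test_X, test_y),
        result_file_path='result/single_channel_bow_cnn_cv.txt',
        nb_epoch=30,
        verbose=0,
        feature_type='seg',
        remove_stopword=True,
        word2vec_to_solve_oov=False,
        rand_seed=1337,
        l1_conv_filter_type=[2, 4, 6],
        l2_conv_filter_type=[3],
        k=[2, 2],
        layer1=[5],
        layer2=[3],
        hidden1=[50],
        hidden2=[100],
    )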