sentence_padding_length=sentence_padding_length, verbose=1, need_segmented=True, full_mode=full_mode, remove_stopword=True, replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, padding_mode='center', add_unkown_word=True, feature_type=feature_type, ) train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix()) feature_encoder.print_model_descibe() feature_encoder.print_sentence_length_detail() # train_y = train_data['LABEL_INDEX'].as_matrix() test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix()) test_all_y = test_data['LABEL_INDEX'].as_matrix() print(train_X_feature.shape) print(test_all_X_feature.shape) logging.debug('=' * 20) # **************************************************************** # ------------- region end : 2. 转换数据的格式并特征编码 ------------- # ****************************************************************
feature_encoder = FeatureEncoder( sentence_padding_length=config['sentence_padding_length'], verbose=0, need_segmented=config['need_segmented'], full_mode=True, replace_number=True, remove_stopword=True, lowercase=True, padding_mode='center', add_unkown_word=True, mask_zero=True, zhs2zht=True, remove_url=True, ) train_X_feature = feature_encoder.fit_transform(train_data=train_X) test_X_feature = map(feature_encoder.transform_sentence, test_X) feature_encoder.print_sentence_length_detail print feature_encoder.vocabulary_size # print ','.join(sorted(feature_encoder.vocabulary)) # quit() feature_encoder.print_model_descibe() # -------------- code start : 结束 ------------- if verbose > 2: logging.debug('-' * 20) print '-' * 20 # -------------- region end : 2. 转换数据格式,以可以进行分类 --------------- for seed in config['rand_seed']:
def test_onehot_bow_cnn():
    """Usage example for MultiChannelOnehotBowCNN.

    Builds two one-hot feature encoders (word-level and segmentation-level)
    over a toy Chinese-sentence dataset, trains a two-channel one-hot BoW CNN
    on them, and prints predictions. Side effects only (console output,
    model training); returns None.
    """
    # Toy training / test data: short Chinese sentences with integer labels.
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['句子', '你好', '你妹']
    test_y = [2, 3, 0]

    sentence_padding_length = 8
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

    def _make_encoder(feature_type):
        # Both channels share every option except feature_type
        # ('word' vs 'seg'), so the common configuration lives here.
        return FeatureEncoder(
            sentence_padding_length=sentence_padding_length,
            verbose=0,
            need_segmented=True,
            full_mode=True,
            replace_number=True,
            remove_stopword=True,
            lowercase=True,
            padding_mode='left',
            add_unkown_word=True,
            feature_type=feature_type,
            zhs2zht=True,
            remove_url=True,
            # set True to emit one-hot arrays
            to_onehot_array=True,
        )

    # Channel 1: word-level one-hot features.
    word_feature_encoder = _make_encoder('word')
    train_X_word_feature = word_feature_encoder.fit_transform(train_X)
    test_X_word_feature = word_feature_encoder.transform(test_X)
    print(','.join(word_feature_encoder.vocabulary))
    print(train_X_word_feature.shape)
    print(train_X_word_feature)

    # Channel 2: segmentation-level one-hot features.
    seg_feature_encoder = _make_encoder('seg')
    train_X_seg_feature = seg_feature_encoder.fit_transform(train_X)
    test_X_seg_feature = seg_feature_encoder.transform(test_X)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_X_seg_feature.shape)
    print(train_X_seg_feature)

    # quit()
    onehot_cnn = MultiChannelOnehotBowCNN(
        rand_seed=1377,
        verbose=1,
        feature_encoder=(word_feature_encoder, seg_feature_encoder),
        # optimizers='adadelta',
        optimizers='sgd',
        word_input_length=sentence_padding_length,
        seg_input_length=sentence_padding_length,
        word_input_dim=word_feature_encoder.vocabulary_size,
        seg_input_dim=seg_feature_encoder.vocabulary_size,
        num_labels=5,
        l1_conv_filter_type=[
            [1, 2, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, 3, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, -1, -1, 'bow', (0, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[
            # [16, 2, -1, 'valid',(2,1),0.5, 'relu', 'none']
        ],
        full_connected_layer_units=[
            (50, 0.5, 'relu', 'none'),
        ],
        embedding_dropout_rate=0.,
        nb_epoch=30,
        nb_batch=5,
        earlyStoping_patience=20,
        lr=1e-2,
    )
    onehot_cnn.print_model_descibe()

    # Train the model.
    # To load a previously pickled model instead:
    # onehot_cnn.model_from_pickle('model/modelA.pkl')
    print(onehot_cnn.fit(
        ([train_X_word_feature, train_X_seg_feature], train_y),
        ([test_X_word_feature, test_X_seg_feature], test_y),
    ))
    print(train_y)
    # loss, train_accuracy = onehot_cnn.model.evaluate(train_X_feature, train_y)
    # onehot_cnn.accuracy((train_X_word_feature, train_y), transform_input=False)
    print(onehot_cnn.batch_predict(
        [test_X_word_feature, test_X_seg_feature], transform_input=False))
    print(onehot_cnn.batch_predict_bestn(
        [test_X_word_feature, test_X_seg_feature], transform_input=False, bestn=2))
    quit()

    # NOTE(review): everything below is unreachable after quit(); kept as
    # manual-debugging scaffolding, matching the original.
    print(onehot_cnn.batch_predict(test_X, transform_input=True))
    print(onehot_cnn.predict(test_X[0], transform_input=True))
    onehot_cnn.accuracy((test_X, test_y), transform_input=True)
    # Save the model:
    # onehot_cnn.save_model('model/modelA.pkl')
    print(onehot_cnn.predict('你好吗', transform_input=True))
test_y = [3, 1, 1] sentence_padding_length = 10 feature_encoder = FeatureEncoder( sentence_padding_length=sentence_padding_length, verbose=0, need_segmented=True, full_mode=True, remove_stopword=True, replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, padding_mode='center', add_unkown_word=True, mask_zero=True) train_X_features = feature_encoder.fit_transform(train_data=train_X) print(train_X_features) dcnn = DynamicCNN( rand_seed=1337, verbose=2, batch_size=1, vocab_size=feature_encoder.vocabulary_size, word_embedding_dim=48, # input_length=None, input_length=sentence_padding_length, num_labels=4, conv_filter_type=[ [100, 2, 'full'], [100, 4, 'full'], # [100,6,5,'valid'], ],