def test_single_bow():
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['句子,句子', '你好', '你妹']
    test_y = [2, 3, 0]

    # Build combined word + character level features
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type='word_seg',
        max_features=2000,
    )
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(feature_encoder.vocabulary_size)
    print(','.join(feature_encoder.vocabulary))
    print(train_X_feature)
    print(test_X_feature)

    bow_cnn = SingleChannelBowCNN(
        rand_seed=1337,
        verbose=1,
        feature_encoder=feature_encoder,
        num_labels=4,
        input_length=feature_encoder.vocabulary_size,
        l1_conv_filter_type=[
            [5, 2, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
            [5, 4, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
            # [5, 4, 1, 'valid', (-2, 1), 0.],
            # [5, 6, 1, 'valid', (-2, 1), 0.],
        ],
        l2_conv_filter_type=[
            [3, 2, 1, 'valid', (2, 1), 0., 'relu', 'none'],
        ],
        full_connected_layer_units=[
            (50, 0.25, 'relu', 'none'),
            (100, 0.25, 'relu', 'none'),
        ],
        # full_connected_layer_units=[50, 100],
        output_dropout_rate=0.,
        nb_epoch=30,
        earlyStoping_patience=50,
        optimizers='sgd',
        batch_size=2,
    )

    bow_cnn.print_model_descibe()
    # bow_cnn.model_from_pickle('model/AA.pkl')
    print(bow_cnn.fit((train_X_feature, train_y), (test_X_feature, test_y)))
    print(bow_cnn.predict('你好', transform_input=True))
    # print(bow_cnn.get_layer_output(['你好'], layer='conv2', transform_input=True))
    print(bow_cnn.get_layer_output(['好'], layer='hidden2', transform_input=True))
    # print(bow_cnn.get_layer_output(['好'], layer='batchnormalization', transform_input=True))
    bow_cnn.accuracy((test_X_feature, test_y))
    print(bow_cnn.batch_predict(test_X, True))
    print(bow_cnn.batch_predict(test_X_feature, False))
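
# The FeatureEncoder constructor is repeated with near-identical arguments throughout
# this file. The helper below is only a sketch (build_bow_encoder is not part of the
# original code): it keeps the shared preprocessing flags in one place and lets the
# caller vary feature_type, e.g. 'seg' or 'word_seg' as used in the snippets here.
def build_bow_encoder(feature_type, max_features=2000, remove_stopword=True, verbose=0):
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    return FeatureEncoder(
        verbose=verbose,
        need_segmented=True,
        full_mode=True,
        remove_stopword=remove_stopword,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=max_features,
    )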
test_y = [2, 3, 0]

feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    feature_method='bow',
    max_features=2000,
    feature_type='seg',
)
train_X_feature = feature_encoder.fit_transform(train_X)
test_X_feature = feature_encoder.transform(test_X)
print(','.join(feature_encoder.vocabulary))
print(train_X_feature)

bow_rf = BowRandomForest(
    rand_seed=1337,
    verbose=0,
    n_estimators=200,
    min_samples_leaf=1,
    feature_encoder=feature_encoder,
    word2vec_to_solve_oov=True,
    word2vec_model_file_path='/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/vector1000000_50dim.gem',
)
# bow_rf.model_from_pickle('model.pkl')
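
# A sketch of how the random forest above is exercised elsewhere in this file
# (fit followed by accuracy); train_y is assumed to be the label list that pairs
# with train_X in the surrounding, truncated test function, not something defined
# in this excerpt.
bow_rf.fit(train_data=(train_X_feature, train_y),
           validation_data=(test_X_feature, test_y))
_, _, val_accuracy, _ = bow_rf.accuracy((test_X_feature, test_y), False)
print(val_accuracy)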
test_y = [2, 3, 0]

# Build word(segmentation)-level features
seg_feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    feature_method='bow',
    feature_type='seg',
    max_features=2000,
)
train_seg_X_feature = seg_feature_encoder.fit_transform(train_X)
test_seg_X_feature = seg_feature_encoder.transform(test_X)
print(seg_feature_encoder.vocabulary_size)
print(','.join(seg_feature_encoder.vocabulary))
print(train_seg_X_feature)
print(test_seg_X_feature)

# Build character-level features
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
word_feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    # the remaining arguments follow the seg encoder above;
    # 'word' (character level) is assumed for feature_type
    zhs2zht=True,
    remove_url=True,
    feature_method='bow',
    feature_type='word',
    max_features=2000,
)
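
# The two encoders above produce separate matrices (word-segmentation level and
# character level). One common way to feed both into a single model is to
# concatenate them column-wise. This is only a sketch of that step, not code from
# the original file, and it assumes both encoders return dense 2-D arrays aligned
# by sample.
import numpy as np

train_word_X_feature = word_feature_encoder.fit_transform(train_X)
test_word_X_feature = word_feature_encoder.transform(test_X)

train_X_combined = np.hstack([train_seg_X_feature, train_word_X_feature])
test_X_combined = np.hstack([test_seg_X_feature, test_word_X_feature])
print(train_X_combined.shape)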
logging.debug('Extracting feature vectors with %s' % (config['model']))
print('Extracting feature vectors with %s' % (config['model']))
if config['refresh_all_model']:
    feature_encoder = FeatureEncoder(
        verbose=config['verbose'],
        need_segmented=True,
        full_mode=config['full_mode'],
        remove_stopword=config['remove_stopword'],
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method=config['model'],
        max_features=config['max_features'],
    )
    train_X_feature = feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
    test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
    print(train_X_feature)
    feature_encoder.print_model_descibe()
    keywords = feature_encoder.vocabulary
    print(','.join(keywords))
    print(len(keywords))
else:
    feature_encoder = None

logging.debug('fit X shape is :%s' % (str(train_X_feature.shape)))
print('fit X shape is :%s' % (str(train_X_feature.shape)))
print(train_X_feature[0])
# ------------------------------------------------------------------------------
# -------------- region end : convert data to features -------------
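
# Note on the branch above: when config['refresh_all_model'] is false, the excerpt
# leaves train_X_feature / test_X_feature undefined before they are used below.
# A sketch of one way to cover that case follows; the cache file name and the use
# of pickle are assumptions for illustration, not part of the original script.
import pickle

if not config['refresh_all_model']:
    with open('feature_cache.pkl', 'rb') as f:  # hypothetical cache file
        train_X_feature, test_X_feature = pickle.load(f)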
# print rand_indix
train_accu = []
test_accu = []
for i in rand_indix:
    # Grow the training pool one fold at a time
    dev_x.extend(list(train_X_10fold[i]))
    dev_y.extend(list(train_y_10fold[i]))
    print('train data size:%d' % len(dev_y))
    x = dev_x
    y = dev_y
    if shuffle_data:
        # Shuffle the data; using the same seed keeps x and y aligned
        x = np.random.RandomState(seed).permutation(x)
        y = np.random.RandomState(seed).permutation(y)
    # print(dev_y)
    dev_X_feature = feature_encoder.fit_transform(x)
    test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
    bow_rf = BowRandomForest(
        # rand_seed=rand_seed,
        verbose=0,
        n_estimators=estimators,
        min_samples_leaf=1,
        feature_encoder=None,
    )
    bow_rf.fit(train_data=(dev_X_feature, y),
               validation_data=(test_X_feature, test_y))
    _, _, dev_accuracy, _ = bow_rf.accuracy((dev_X_feature, y), False)
    _, _, val_accuracy, _ = bow_rf.accuracy((test_X_feature, test_y), False)
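    # train_accu and test_accu are declared before the loop, but the excerpt stops
    # before they are filled; the bookkeeping and summary below are a sketch of the
    # likely continuation, not original code.
    train_accu.append(dev_accuracy)
    test_accu.append(val_accuracy)

print('train accuracy per step: %s' % train_accu)
print('test accuracy per step: %s' % test_accu)
print('best test accuracy: %f' % max(test_accu))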
print('Extracting feature vectors with %s' % (config['model']))

seg_feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=False,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    feature_method='bow',
    feature_type='seg',
    max_features=2000,
)
train_seg_X_feature = seg_feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
test_seg_X_feature = seg_feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
print(seg_feature_encoder.vocabulary_size)
print(','.join(seg_feature_encoder.vocabulary))
# print(train_X_feature)
# print(test_X_feature)

# Build character-level features
word_feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    # the remaining arguments follow the seg encoder above;
    # 'word' (character level) is assumed for feature_type
    remove_url=True,
    feature_method='bow',
    feature_type='word',
    max_features=2000,
)