# Exemplo n.º 1 (example 1; the stray "0" below was the example's score on the source site)
# 0
    sentence_padding_length=sentence_padding_length,
    verbose=1,
    need_segmented=True,
    full_mode=full_mode,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    padding_mode='center',
    add_unkown_word=True,
    feature_type=feature_type,
)


# Fit the encoder's vocabulary on the training sentences and encode them into
# padded index sequences in a single pass.
train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())
# Print the encoder configuration and the sentence-length distribution
# (useful for sanity-checking sentence_padding_length).
feature_encoder.print_model_descibe()
feature_encoder.print_sentence_length_detail()

# train_y = train_data['LABEL_INDEX'].as_matrix()

# Encode the test sentences with the already-fitted vocabulary (transform only,
# no re-fitting).
test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix())

test_all_y = test_data['LABEL_INDEX'].as_matrix()

# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0 — presumably an
# old pandas is pinned here; `.values` is the drop-in modern equivalent. Confirm
# before upgrading the environment.
print(train_X_feature.shape)
print(test_all_X_feature.shape)
logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : 2. convert data format and encode features -------------
# ****************************************************************
# Build a fresh feature encoder from the run configuration.
# NOTE(review): mask_zero=True presumably reserves index 0 as the padding value
# (Keras-style masking) — confirm against FeatureEncoder's documentation.
feature_encoder = FeatureEncoder(
                                 sentence_padding_length=config['sentence_padding_length'],
                                 verbose=0,
                                 need_segmented=config['need_segmented'],
                                 full_mode=True,
                                 replace_number=True,
                                 remove_stopword=True,
                                 lowercase=True,
                                 padding_mode='center',
                                 add_unkown_word=True,
                                 mask_zero=True,
                                 zhs2zht=True,
                                 remove_url=True,
                                 )

# Fit on the training sentences, then encode each test sentence individually
# with the already-fitted vocabulary.
train_X_feature = feature_encoder.fit_transform(train_data=train_X)
test_X_feature = map(feature_encoder.transform_sentence, test_X)

# BUG FIX: the original line was `feature_encoder.print_sentence_length_detail`
# without parentheses — a silent no-op attribute access. Call the method so the
# sentence-length statistics are actually printed.
feature_encoder.print_sentence_length_detail()
print(feature_encoder.vocabulary_size)
feature_encoder.print_model_descibe()
# -------------- code start : end -------------
# At high verbosity (>2), emit a 20-character visual separator to both the
# debug log and stdout (Python 2 print statement).
if verbose > 2:
    logging.debug('-' * 20)
    print '-' * 20
# -------------- region end : 2. convert data format so classification can run ---------------

for seed in config['rand_seed']:
# Exemplo n.º 3 (example 3; the stray "0" below was the example's score on the source site)
# 0
def test_onehot_bow_cnn():
    """Smoke-test MultiChannelOnehotBowCNN on a tiny Chinese toy dataset.

    Builds two one-hot feature encoders over the same sentences — one at
    word granularity (feature_type='word') and one at segment granularity
    (feature_type='seg') — feeds both channels into the CNN, trains briefly,
    and prints predictions.
    """
    # Toy corpus: 5 training sentences / 3 test sentences with integer labels.
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]  # FIX: was misspelled `trian_y` throughout
    test_X = ['句子', '你好', '你妹']
    test_y = [2, 3, 0]
    sentence_padding_length = 8
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

    # Word-level channel: segment sentences, pad on the left.
    word_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='word',
        zhs2zht=True,
        remove_url=True,
        # When True, the encoder outputs a one-hot array instead of indices.
        to_onehot_array=True,
    )

    train_X_word_feature = word_feature_encoder.fit_transform(train_X)
    test_X_word_feature = word_feature_encoder.transform(test_X)
    print(','.join(word_feature_encoder.vocabulary))
    print(train_X_word_feature.shape)
    print(train_X_word_feature)

    # Segment-level channel: identical settings except feature_type='seg'.
    seg_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='seg',
        zhs2zht=True,
        remove_url=True,
        # When True, the encoder outputs a one-hot array instead of indices.
        to_onehot_array=True,
    )

    train_X_seg_feature = seg_feature_encoder.fit_transform(train_X)
    test_X_seg_feature = seg_feature_encoder.transform(test_X)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_X_seg_feature.shape)
    print(train_X_seg_feature)

    # Two-channel CNN over the word and segment one-hot inputs.
    onehot_cnn = MultiChannelOnehotBowCNN(
        rand_seed=1377,
        verbose=1,
        feature_encoder=(word_feature_encoder, seg_feature_encoder),
        # optimizers='adadelta',
        optimizers='sgd',
        word_input_length=sentence_padding_length,
        seg_input_length=sentence_padding_length,
        word_input_dim=word_feature_encoder.vocabulary_size,
        seg_input_dim=seg_feature_encoder.vocabulary_size,
        num_labels=5,
        l1_conv_filter_type=[
            [1, 2, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, 3, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, -1, -1, 'bow', (0, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[
            # [16, 2, -1, 'valid',(2,1),0.5, 'relu', 'none']
        ],
        full_connected_layer_units=[
            (50, 0.5, 'relu', 'none'),
        ],
        embedding_dropout_rate=0.,
        nb_epoch=30,
        nb_batch=5,
        earlyStoping_patience=20,
        lr=1e-2,
    )
    onehot_cnn.print_model_descibe()
    # Train; alternatively a previously-saved model could be restored:
    # onehot_cnn.model_from_pickle('model/modelA.pkl')
    print(onehot_cnn.fit(([train_X_word_feature, train_X_seg_feature], train_y),
                         ([test_X_word_feature, test_X_seg_feature], test_y)))
    print(train_y)

    # Predict on pre-encoded features (no input transformation needed).
    print(onehot_cnn.batch_predict([test_X_word_feature, test_X_seg_feature], transform_input=False))
    print(onehot_cnn.batch_predict_bestn([test_X_word_feature, test_X_seg_feature], transform_input=False, bestn=2))
    quit()  # NOTE: deliberate early stop — everything below is a manual raw-text demo
    print(onehot_cnn.batch_predict(test_X, transform_input=True))
    print(onehot_cnn.predict(test_X[0], transform_input=True))
    onehot_cnn.accuracy((test_X, test_y), transform_input=True)
    # Persist the trained model:
    # onehot_cnn.save_model('model/modelA.pkl')

    print(onehot_cnn.predict('你好吗', transform_input=True))
# Exemplo n.º 4 (example 4; the stray "0" below was the example's score on the source site)
# 0
 test_y = [3, 1, 1]
 sentence_padding_length = 10
 feature_encoder = FeatureEncoder(
     sentence_padding_length=sentence_padding_length,
     verbose=0,
     need_segmented=True,
     full_mode=True,
     remove_stopword=True,
     replace_number=True,
     lowercase=True,
     zhs2zht=True,
     remove_url=True,
     padding_mode='center',
     add_unkown_word=True,
     mask_zero=True)
 train_X_features = feature_encoder.fit_transform(train_data=train_X)
 print(train_X_features)
 dcnn = DynamicCNN(
     rand_seed=1337,
     verbose=2,
     batch_size=1,
     vocab_size=feature_encoder.vocabulary_size,
     word_embedding_dim=48,
     # input_length=None,
     input_length=sentence_padding_length,
     num_labels=4,
     conv_filter_type=[
         [100, 2, 'full'],
         [100, 4, 'full'],
         # [100,6,5,'valid'],
     ],