def test_single_bow():
    """Smoke-test a single-channel bag-of-words CNN on a tiny Chinese corpus.

    Builds combined word+character ('word_seg') BOW features, trains a
    SingleChannelBowCNN for a few epochs, then exercises its predict /
    layer-output / accuracy / batch-predict APIs, printing everything.
    """
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]  # fixed typo: was `trian_y`
    test_X = ['句子,句子', '你好', '你妹']
    test_y = [2, 3, 0]
    # Build combined word/character level features.
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,  # presumably simplified -> traditional Chinese — confirm
        remove_url=True,
        feature_method='bow',
        feature_type='word_seg',
        max_features=2000,
    )
    # Fit the vocabulary on train, then encode test with the same vocabulary.
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(feature_encoder.vocabulary_size)
    print(','.join(feature_encoder.vocabulary))
    print(train_X_feature)
    print(test_X_feature)
    bow_cnn = SingleChannelBowCNN(
        rand_seed=1337,
        verbose=1,
        feature_encoder=feature_encoder,
        num_labels=4,
        input_length=feature_encoder.vocabulary_size,
        # Filter spec appears to be [nb_filter, rows, cols, border_mode,
        # pool_size, dropout, activation, normalization] — verify against
        # the SingleChannelBowCNN constructor.
        l1_conv_filter_type=[
            [5, 2, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
            [5, 4, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
        ],
        l2_conv_filter_type=[[3, 2, 1, 'valid', (2, 1), 0., 'relu', 'none']],
        full_connected_layer_units=[(50, 0.25, 'relu', 'none'),
                                    (100, 0.25, 'relu', 'none')],
        output_dropout_rate=0.,
        nb_epoch=30,
        earlyStoping_patience=50,
        optimizers='sgd',
        batch_size=2,
    )
    bow_cnn.print_model_descibe()  # (sic: method name typo in the project API)
    print(bow_cnn.fit((train_X_feature, train_y), (test_X_feature, test_y)))
    print(bow_cnn.predict('你好', transform_input=True))
    print(
        bow_cnn.get_layer_output(['好'], layer='hidden2', transform_input=True))

    bow_cnn.accuracy((test_X_feature, test_y))
    print(bow_cnn.batch_predict(test_X, True))
    print(bow_cnn.batch_predict(test_X_feature, False))
# Example #2
# 0
    # NOTE(review): orphaned scrape fragment — the enclosing `def` header,
    # `train_X` / `test_X`, and the FeatureEncoder / BowRandomForest imports
    # all live outside this excerpt.
    # Word(segment)-level bag-of-words encoder over the (Chinese) sentences.
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,  # presumably simplified -> traditional Chinese — confirm
        remove_url=True,
        feature_method='bow',
        max_features=2000,
        feature_type='seg',
    )
    # Fit the vocabulary on train, then encode test with the same vocabulary.
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(','.join(feature_encoder.vocabulary))

    print(train_X_feature)

    # Random-forest classifier over the BOW features; the word2vec flags
    # presumably back off out-of-vocabulary words to the pre-trained vectors
    # at the hard-coded absolute path — confirm against BowRandomForest.
    bow_rf = BowRandomForest(
        rand_seed=1337,
        verbose=0,
        n_estimators=200,
        min_samples_leaf=1,
        feature_encoder=feature_encoder,
        word2vec_to_solve_oov=True,
        word2vec_model_file_path=
        '/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/vector1000000_50dim.gem',
    )
    # bow_rf.model_from_pickle('model.pkl')
# Example #3
# 0
    # NOTE(review): truncated scrape fragment — the enclosing def, train_X /
    # test_X and the first FeatureEncoder import live outside this excerpt,
    # and the second FeatureEncoder call below is cut off mid-arguments, so
    # this block does not parse as-is.
    # Build word(segment)-level features.
    seg_feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,  # presumably simplified -> traditional Chinese — confirm
        remove_url=True,
        feature_method='bow',
        feature_type='seg',
        max_features=2000,
    )
    # Fit the vocabulary on train, then encode test with the same vocabulary.
    train_seg_X_feature = seg_feature_encoder.fit_transform(train_X)
    test_seg_X_feature = seg_feature_encoder.transform(test_X)
    print(seg_feature_encoder.vocabulary_size)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_seg_X_feature)
    print(test_seg_X_feature)

    # Build character-level features.
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    word_feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
# Example #4
# 0
# NOTE(review): truncated top-level scrape fragment. The bare `print`
# statements below mark this as Python 2 code; `config`, `train_data`,
# `test_data`, `logging` and `FeatureEncoder` are defined outside this
# excerpt.
print('使用 %s 提取特征向量'%(config['model']))
if config['refresh_all_model']:
    # Re-fit the feature encoder from scratch on the training sentences.
    feature_encoder = FeatureEncoder(
                                     verbose=config['verbose'],
                                     need_segmented=True,
                                     full_mode=config['full_mode'],
                                     remove_stopword=config['remove_stopword'],
                                     replace_number=True,
                                     lowercase=True,
                                     zhs2zht=True,
                                     remove_url=True,
                                     feature_method=config['model'],
                                     max_features=config['max_features'],
                                     )
    train_X_feature = feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
    test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
    print(train_X_feature)
    feature_encoder.print_model_descibe()
    keywords = feature_encoder.vocabulary
    print ','.join(keywords)
    print len(keywords)
else:
    feature_encoder = None
# NOTE(review): when config['refresh_all_model'] is falsy, train_X_feature is
# never bound and the three uses below raise NameError — confirm whether the
# else-branch is expected to load features from elsewhere.
logging.debug('fit X shape is :%s'%(str(train_X_feature.shape)))
print('fit X shape is :%s'%(str(train_X_feature.shape)))

print(train_X_feature[0])

# ------------------------------------------------------------------------------
# -------------- region end : convert data to features -------------
# ------------------------------------------------------------------------------
    # NOTE(review): truncated scrape fragment (duplicate of an earlier
    # snippet) — the enclosing def, train_X / test_X and the first
    # FeatureEncoder import live outside this excerpt, and the second
    # FeatureEncoder call below is cut off mid-arguments, so this block does
    # not parse as-is.
    # Build word(segment)-level features.
    seg_feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,  # presumably simplified -> traditional Chinese — confirm
        remove_url=True,
        feature_method='bow',
        feature_type='seg',
        max_features=2000,
    )
    # Fit the vocabulary on train, then encode test with the same vocabulary.
    train_seg_X_feature = seg_feature_encoder.fit_transform(train_X)
    test_seg_X_feature = seg_feature_encoder.transform(test_X)
    print(seg_feature_encoder.vocabulary_size)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_seg_X_feature)
    print(test_seg_X_feature)

    # Build character-level features.
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    word_feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
# Example #6
# 0
# NOTE(review): truncated top-level scrape fragment — train_data / test_data
# and the FeatureEncoder import are outside this excerpt, and the second
# FeatureEncoder call below is cut off mid-arguments, so this block does not
# parse as-is.
# Word(segment)-level bag-of-words encoder (stopwords kept in this variant).
seg_feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=False,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,  # presumably simplified -> traditional Chinese — confirm
    remove_url=True,
    feature_method='bow',
    feature_type='seg',
    max_features=2000,
)
# NOTE(review): DataFrame/Series.as_matrix() was removed in pandas 1.0;
# .to_numpy() is the modern equivalent — this fragment targets old pandas.
train_seg_X_feature = seg_feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
test_seg_X_feature = seg_feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
print(seg_feature_encoder.vocabulary_size)
print(','.join(seg_feature_encoder.vocabulary))
# print(train_X_feature)
# print(test_X_feature)

# Build character-level features.
word_feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
# Example #7
# 0
    # NOTE(review): orphaned scrape fragment (duplicate of an earlier
    # snippet) — the enclosing `def` header, `train_X` / `test_X`, and the
    # FeatureEncoder / BowRandomForest imports all live outside this excerpt.
    # Word(segment)-level bag-of-words encoder over the (Chinese) sentences.
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,  # presumably simplified -> traditional Chinese — confirm
        remove_url=True,
        feature_method='bow',
        max_features=2000,
        feature_type='seg',
    )
    # Fit the vocabulary on train, then encode test with the same vocabulary.
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(','.join(feature_encoder.vocabulary))

    print(train_X_feature)

    # Random-forest classifier over the BOW features; the word2vec flags
    # presumably back off out-of-vocabulary words to the pre-trained vectors
    # at the hard-coded absolute path — confirm against BowRandomForest.
    bow_rf = BowRandomForest(
        rand_seed=1337,
        verbose=0,
        n_estimators=200,
        min_samples_leaf=1,
        feature_encoder=feature_encoder,
        word2vec_to_solve_oov=True,
        word2vec_model_file_path='/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/vector1000000_50dim.gem',
    )
    # bow_rf.model_from_pickle('model.pkl')
def test_single_bow():
    """Smoke-test a single-channel bag-of-words CNN on a tiny Chinese corpus.

    Duplicate of the earlier snippet: builds combined word+character
    ('word_seg') BOW features, trains a SingleChannelBowCNN for a few epochs,
    then exercises predict / layer-output / accuracy / batch-predict.
    """
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]  # fixed typo: was `trian_y`
    test_X = ['句子,句子', '你好', '你妹']
    test_y = [2, 3, 0]
    # Build combined word/character level features.
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,  # presumably simplified -> traditional Chinese — confirm
        remove_url=True,
        feature_method='bow',
        feature_type='word_seg',
        max_features=2000,
    )
    # Fit the vocabulary on train, then encode test with the same vocabulary.
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(feature_encoder.vocabulary_size)
    print(','.join(feature_encoder.vocabulary))
    print(train_X_feature)
    print(test_X_feature)
    bow_cnn = SingleChannelBowCNN(
        rand_seed=1337,
        verbose=1,
        feature_encoder=feature_encoder,
        num_labels=4,
        input_length=feature_encoder.vocabulary_size,
        # Filter spec appears to be [nb_filter, rows, cols, border_mode,
        # pool_size, dropout, activation, normalization] — verify against
        # the SingleChannelBowCNN constructor.
        l1_conv_filter_type=[
            [5, 2, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
            [5, 4, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
        ],
        l2_conv_filter_type=[
            [3, 2, 1, 'valid', (2, 1), 0., 'relu', 'none']
        ],
        full_connected_layer_units=[(50, 0.25, 'relu', 'none'),
                                    (100, 0.25, 'relu', 'none')],
        output_dropout_rate=0.,
        nb_epoch=30,
        earlyStoping_patience=50,
        optimizers='sgd',
        batch_size=2,
    )
    bow_cnn.print_model_descibe()  # (sic: method name typo in the project API)
    print(bow_cnn.fit(
        (train_X_feature, train_y),
        (test_X_feature, test_y)))
    print(bow_cnn.predict('你好', transform_input=True))
    print(bow_cnn.get_layer_output(['好'], layer='hidden2', transform_input=True))

    bow_cnn.accuracy((test_X_feature, test_y))
    print(bow_cnn.batch_predict(test_X, True))
    print(bow_cnn.batch_predict(test_X_feature, False))