def test_single_bow():
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['句子,句子', '你好', '你妹']
    test_y = [2, 3, 0]
    # Generate combined character- and word-level features
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type='word_seg',
        max_features=2000,
    )
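    # fit_transform() builds the vocabulary from train_X and returns its BOW
    # matrix; transform() encodes test_X against that fixed vocabulary.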
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(feature_encoder.vocabulary_size)
    print(','.join(feature_encoder.vocabulary))
    print(train_X_feature)
    print(test_X_feature)
    bow_cnn = SingleChannelBowCNN(
        rand_seed=1337,
        verbose=1,
        feature_encoder=feature_encoder,
        num_labels=4,
        input_length=feature_encoder.vocabulary_size,
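        # Assumed filter-spec layout, inferred from how cross_validation() below
        # fills these slots (not documented in the source):
        # [nb_filter, filter_length, filter_width, border_mode, pool_size,
        #  dropout_rate, activation, normalization]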
        l1_conv_filter_type=[
            [5, 2, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
            [5, 4, 1, 'valid', (2, 1), 0., 'relu', 'batch_normalization'],
            # [5, 4, 1, 'valid',(-2,1),0.],
            # [5, 6, 1, 'valid',(-2,1),0.],
        ],
        l2_conv_filter_type=[[3, 2, 1, 'valid', (2, 1), 0., 'relu', 'none']],
        full_connected_layer_units=[(50, 0.25, 'relu', 'none'),
                                    (100, 0.25, 'relu', 'none')],
        # full_connected_layer_units=[50, 100],
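        # Each fully-connected spec is presumably (units, dropout_rate,
        # activation, normalization); the commented line above is the plain form.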
        output_dropout_rate=0.,
        nb_epoch=30,
        earlyStoping_patience=50,
        optimizers='sgd',
        batch_size=2,
    )
    bow_cnn.print_model_descibe()
    # bow_cnn.model_from_pickle('model/AA.pkl')
    print(bow_cnn.fit((train_X_feature, train_y), (test_X_feature, test_y)))
    print(bow_cnn.predict('你好', transform_input=True))
    # print(bow_cnn.get_layer_output(['你好'],layer='conv2', transform_input=True))
    print(
        bow_cnn.get_layer_output(['好'], layer='hidden2', transform_input=True))
    # print(bow_cnn.get_layer_output(['好'], layer='batchnormalization',transform_input=True))

    bow_cnn.accuracy((test_X_feature, test_y))
    print(bow_cnn.batch_predict(test_X, True))
    print(bow_cnn.batch_predict(test_X_feature, False))
Example #2
    def get_feature_encoder(**kwargs):
        '''
            Return the feature encoder for this model's input.

        :param kwargs: Optional settings: [ full_mode(#,False), feature_type(#,word), verbose(#,0) ], word2vec_to_solve_oov[#,False], word2vec_model_file_path[#,None]. A * marks a required argument; a # marks an optional one whose default is listed.

        :return:
        '''

        feature_encoder = FeatureEncoder(
            verbose=kwargs.get('verbose', 0),
            need_segmented=kwargs.get('need_segmented', True),
            full_mode=kwargs.get('full_mode', False),
            replace_number=True,
            remove_stopword=True,
            lowercase=True,
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            zhs2zht=True,
            remove_url=True,
            feature_method='bow',
            max_features=2000,
            word2vec_to_solve_oov=kwargs.get('word2vec_to_solve_oov', False),
            word2vec_model_file_path=kwargs.get('word2vec_model_file_path',
                                                None))
        if kwargs.get('verbose', 0) > 0:
            pprint.pprint(kwargs)

        return feature_encoder
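
    # A minimal usage sketch, assuming this helper is reachable on the model
    # class (the class name below is illustrative, not from the source):
    #     encoder = SomeBowModel.get_feature_encoder(feature_type='word', verbose=1)
    #     train_feats = encoder.fit_transform(['你好', '无聊'])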
Example #3
            # endregion


if __name__ == '__main__':
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['你妹', '句子', '你好']
    test_y = [2, 3, 0]

    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        max_features=2000,
        feature_type='seg',
    )
    train_X_feature = feature_encoder.fit_transform(train_X)
    test_X_feature = feature_encoder.transform(test_X)
    print(','.join(feature_encoder.vocabulary))

    print(train_X_feature)

    bow_rf = BowRandomForest(
        rand_seed=1337,
        verbose=0,
Example #4
        return detail


if __name__ == '__main__':
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    train_y = [1, 3, 2, 2, 3]
    test_X = ['句子,句子', '你好', '你妹']
    test_y = [2, 3, 0]
    # Generate word-level (segmented) features
    seg_feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type='seg',
        max_features=2000,
    )
    train_seg_X_feature = seg_feature_encoder.fit_transform(train_X)
    test_seg_X_feature = seg_feature_encoder.transform(test_X)
    print(seg_feature_encoder.vocabulary_size)
    print(','.join(seg_feature_encoder.vocabulary))
    print(train_seg_X_feature)
    print(test_seg_X_feature)

    # Generate character-level features
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
Example #5
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : convert data to features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Start generating feature vectors...')

logging.debug('Extracting feature vectors with %s' % config['model'])
print('Extracting feature vectors with %s' % config['model'])
if config['refresh_all_model']:
    feature_encoder = FeatureEncoder(
                                     verbose=config['verbose'],
                                     need_segmented=True,
                                     full_mode=config['full_mode'],
                                     remove_stopword=config['remove_stopword'],
                                     replace_number=True,
                                     lowercase=True,
                                     zhs2zht=True,
                                     remove_url=True,
                                     feature_method=config['model'],
                                     max_features=config['max_features'],
                                     )
    train_X_feature = feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
    test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
    print(train_X_feature)
    feature_encoder.print_model_descibe()
    keywords = feature_encoder.vocabulary
    print(','.join(keywords))
    print(len(keywords))
else:
    feature_encoder = None

# train_X_feature only exists when the model was refreshed above
if feature_encoder is not None:
    logging.debug('fit X shape is: %s' % str(train_X_feature.shape))
# print(train_data.head())

logging.debug('=' * 20)
logging.debug('Start generating feature vectors...')
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
from data_processing_util.cross_validation_util import transform_cv_data

feature_encoder = FeatureEncoder(
    verbose=config['verbose'],
    need_segmented=True,
    full_mode=config['full_mode'],
    remove_stopword=True,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    feature_method=config['model'],
    feature_type=feature_type,
    max_features=config['max_features'],
    word2vec_to_solve_oov=word2vec_to_solve_oov,
    word2vec_model_file_path=config['word2vec_model_file_path'],
)
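
# A sketch of the intended next step, assuming cv/test splits shaped like the
# ones built in process_train_data_for_k_fold() elsewhere on this page:
#     all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y),
#                                     (test_X, test_y), verbose=config['verbose'])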



# print(train_X_feature)
# feature_encoder.print_model_descibe()
# keywords = feature_encoder.vocabulary
# print(','.join(keywords))
# print('vocabulary size: %d' % len(keywords))
Example #7
# ------------------------------------------------------------------------------
# -------------- region start : convert data to features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Start generating feature vectors...')

logging.debug('Extracting feature vectors with %s' % config['model'])
print('Extracting feature vectors with %s' % config['model'])

seg_feature_encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=False,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    feature_method='bow',
    feature_type='seg',
    max_features=2000,
)
train_seg_X_feature = seg_feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
test_seg_X_feature = seg_feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
print(seg_feature_encoder.vocabulary_size)
print(','.join(seg_feature_encoder.vocabulary))
# print(train_X_feature)
# print(test_X_feature)

# Generate character-level features
word_feature_encoder = FeatureEncoder(
Example #9
def process_train_data_for_k_fold(k=3):
    '''
        Split the training data into K folds for cross-validation, keeping the
        class distribution as balanced as possible across folds.
        Input file:   v2.2/v2.2_train_Sa_884.csv
        Output files: v2.2/v2.2_train_Sa_i%d_%d.csv

    :return:
    '''

    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'
    # L or Sa
    dataset_type = 'S'

    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0'
    }
    train_data, test_data = data_util.load_train_test_data(config)
    label_to_index, index_to_label = data_util.get_label_index(
        config['label_version'])
    # print(train_data.head())
    train_X = train_data['SENTENCE'].as_matrix()
    train_y = train_data['LABEL_INDEX'].as_matrix()
    test_X = test_data['SENTENCE'].as_matrix()
    test_y = test_data['LABEL_INDEX'].as_matrix()

    cv_x = []
    cv_y = []
    for index, (x, y) in enumerate(
            data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)):
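        # data_split_k_fold yields one (sentences, labels) fold per iteration,
        # split by class as described in the docstring above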
        cv_x.append(x)
        cv_y.append(y)
        # print(y)
        y = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': y, 'SENTENCE': x})
        data_util.save_data(
            cv_data, 'result/cv_data/v2.3_train_%s_i%d_%d.csv' %
            (dataset_type, index, len(cv_data)))
        print(len(x))
    # quit()
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y),
                                    (test_X, test_y),
                                    verbose=1)

    counter = 0
    for dev_X, dev_y, val_X, val_y in all_cv_data:
        counter += 1
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
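        # each exported row is the integer label followed by its BOW feature vector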
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' %
            (dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' %
            (dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )
    def cross_validation(cv_data, test_data, result_file_path, **kwargs):
        '''
            进行参数的交叉验证

        :param cv_data: k份训练数据
        :type cv_data: array-like
        :param test_data: 测试数据
        :type test_data: array-like
        :return:
        '''

        nb_epoch = kwargs['nb_epoch']
        verbose = kwargs['verbose']
        num_labels = 24
        feature_type = kwargs['feature_type']
        remove_stopword = kwargs['remove_stopword']

        word2vec_to_solve_oov = kwargs['word2vec_to_solve_oov']
        rand_seed = kwargs['rand_seed']
        l1_conv_filter_type = kwargs['l1_conv_filter_type']
        l2_conv_filter_type = kwargs['l2_conv_filter_type']
        k = kwargs['k']

        # detailed results are written to...
        detail_result_file_path = result_file_path
        fout = open(detail_result_file_path, 'w')

        print('=' * 150)

        print(
            'using word2vec: %s\nfeature_type: %s\nremove_stopword: %s\nnb_epoch: %d\nrand_seed: %d'
            % (word2vec_to_solve_oov, feature_type, remove_stopword, nb_epoch,
               rand_seed))
        print('l1_conv_filter_type:%s' % l1_conv_filter_type)
        print('l2_conv_filter_type:%s' % l2_conv_filter_type)
        print('k:%s' % k)
        print('=' * 150)

        fout.write('=' * 150 + '\n')
        fout.write('single-channel CNN-bow CV results:\n')
        fout.write('feature_type:%s\nnb_epoch:%d\nrand_seed:%d\n' %
                   (feature_type, nb_epoch, rand_seed))
        fout.write('l1_conv_filter_type:%s\n' % l1_conv_filter_type)
        fout.write('l2_conv_filter_type:%s\n' % l2_conv_filter_type)
        fout.write('k:%s\n' % k)
        fout.write('=' * 150 + '\n')

        from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
        from data_processing_util.cross_validation_util import transform_cv_data
        feature_encoder = FeatureEncoder(need_segmented=True,
                                         full_mode=True,
                                         replace_number=True,
                                         lowercase=True,
                                         zhs2zht=True,
                                         remove_url=True,
                                         feature_method='bow',
                                         max_features=2000,
                                         **kwargs)

        all_cv_data = transform_cv_data(feature_encoder, cv_data, test_data,
                                        **kwargs)

        for layer1 in kwargs['layer1']:
            for layer2 in kwargs['layer2']:
                for hidden1 in kwargs['hidden1']:
                    for hidden2 in kwargs['hidden2']:

                        print('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d' %
                              (layer1, layer2, hidden1, hidden2))

                        fout.write('=' * 150 + '\n')
                        fout.write(
                            'layer1:%d,layer2:%d,hidden1:%d,hidden2:%d\n' %
                            (layer1, layer2, hidden1, hidden2))
                        # 5-fold
                        print('Starting K-fold cross-validation...')
                        counter = 0
                        test_acc = []
                        train_acc = []
                        for dev_X, dev_y, val_X, val_y in all_cv_data:
                            # print(dev_X2.shape)
                            print('-' * 80)
                            fout.write('-' * 80 + '\n')
                            if counter == 0:
                                # the first entry is the full training run; the rest are CV folds
                                print('training:')
                                fout.write('training\n')
                            else:
                                print('validation fold %d' % counter)
                                fout.write('validation fold %d\n' % counter)

                            bow_cnn = SingleChannelBowCNN(
                                rand_seed=rand_seed,
                                verbose=verbose,
                                feature_encoder=None,
                                num_labels=num_labels,
                                input_length=dev_X.shape[1],
                                l1_conv_filter_type=[
                                    # [layer1, l1_conv_filter_type[0], -1, 'valid', (k[0], 1), 0.,'relu', 'none'],
                                    # [layer1, l1_conv_filter_type[1], -1, 'valid', (k[0], 1), 0.,'relu', 'none'],
                                    # [layer1, l1_conv_filter_type[2], -1, 'valid', (k[0], 1), 0.,'relu', 'batch_normalization'],
                                ],
                                l2_conv_filter_type=[[
                                    layer2, l2_conv_filter_type[0], -1,
                                    'valid', (k[1], 1), 0., 'relu',
                                    'batch_normalization'
                                ]],
                                full_connected_layer_units=[
                                    (hidden1, 0.5, 'relu', 'none'),
                                    (hidden2, 0.5, 'relu', 'none')
                                ],
                                nb_epoch=nb_epoch,
                                earlyStoping_patience=50,
                                optimizers='sgd',
                                batch_size=32,
                                lr=1e-2,
                            )

                            # bow_cnn.print_model_descibe()

                            dev_loss, dev_accuracy, \
                            val_loss, val_accuracy = bow_cnn.fit((dev_X, dev_y), (val_X, val_y))

                            print('dev:%f,%f' % (dev_loss, dev_accuracy))
                            print('val:%f,%f' % (val_loss, val_accuracy))
                            fout.write('dev:%f,%f\n' %
                                       (dev_loss, dev_accuracy))
                            fout.write('val:%f,%f\n' %
                                       (val_loss, val_accuracy))
                            test_acc.append(val_accuracy)
                            train_acc.append(dev_accuracy)
                            counter += 1

                        print('K-fold validation results: %s' % test_acc)
                        print('mean validation accuracy: %f' % np.average(test_acc[1:]))
                        print('-' * 80)

                        fout.write('K-fold training results: %s\n' % train_acc)
                        fout.write('K-fold validation results: %s\n' % test_acc)
                        fout.write('mean: %f\n' % np.average(test_acc[1:]))
                        fout.write('-' * 80 + '\n')
                        fout.flush()
        fout.close()
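
    # A hypothetical invocation sketch; all grid values below are illustrative,
    # and cv_data/test_data are the splits built earlier in this function:
    #     cross_validation(
    #         cv_data=(cv_x, cv_y), test_data=(test_X, test_y),
    #         result_file_path='result/cv_detail.txt',
    #         nb_epoch=30, verbose=0, feature_type='seg', remove_stopword=False,
    #         word2vec_to_solve_oov=False, rand_seed=1337,
    #         l1_conv_filter_type=[2, 4, 6], l2_conv_filter_type=[3], k=(2, 2),
    #         layer1=[5], layer2=[3], hidden1=[50], hidden2=[100],
    #     )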