def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        bow_num_filter_list=None,
        w2v_num_filter_list=None,
        bow_region_size_list=None,
        verbose=0,
        word2vec_model_file_path=None,
    ):

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score
        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=3,
                train_data=train_data,
                test_data=test_data,
                include_train_data=True,
            )

        # 2. Transform the data with the feature encoder
        feature_encoder = BowWordEmbeddingMergeCNN.get_feature_encoder(
            input_length=input_length,
            verbose=0,
            feature_type='word',
        )

        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)
        # Cross-validation
        for bow_num_filter in bow_num_filter_list:
            for bow_region_size in bow_region_size_list:
                for w2v_num_filter in w2v_num_filter_list:

                    print('=' * 40)
                    print(
                        'bow_num_filter, bow_region_size and w2v_num_filter are %d, %d, %d.'
                        % (bow_num_filter, bow_region_size, w2v_num_filter))
                    get_val_score(
                        BowWordEmbeddingMergeCNNWithOneConv,
                        cv_data=cv_data,
                        verbose=verbose,
                        bow_num_filter=bow_num_filter,
                        bow_region_size=bow_region_size,
                        w2v_num_filter=w2v_num_filter,
                        num_labels=24,
                        word2vec_model_file_path=word2vec_model_file_path,
                    )
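
For context, a hypothetical invocation of the grid search above might look like the sketch below. Only the call pattern and the toy data are taken from the source (the doctest in Example #13); the parameter values and the word2vec path are made-up placeholders, and the class is assumed to be imported from its module, whose path the source does not show.

# Hypothetical usage sketch -- class import path and parameter values are assumptions.
train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
train_y = [1, 2, 3, 2, 3]
BowWordEmbeddingMergeCNNWithOneConv.cross_validation(
    train_data=(train_x, train_y),
    input_length=8,
    bow_num_filter_list=[5, 50],
    bow_region_size_list=[2, 3],
    w2v_num_filter_list=[5, 50],
    verbose=1,
    word2vec_model_file_path='path/to/vector1000000_50dim.gem',  # placeholder path
)
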
Example #3
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        conv1_num_filter_list=None,
        conv2_num_filter_list=None,
        verbose=0,
        word2vec_model_file_path=None,
    ):

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score
        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=3,
                train_data=train_data,
                test_data=test_data,
                include_train_data=True,
            )

        # 2. Transform the data with the feature encoder
        feature_encoder = DCNN.get_feature_encoder(
            input_length=input_length,
            verbose=0,
            full_mode=False,
            feature_type='word',
        )
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)
        # Cross-validation
        for conv1_num_filter in conv1_num_filter_list:
            for conv2_num_filter in conv2_num_filter_list:
                print('=' * 40)
                print('num_filter for conv1 and conv2: %d, %d.' %
                      (conv1_num_filter, conv2_num_filter))
                get_val_score(
                    DcnnAcl,
                    cv_data=cv_data,
                    verbose=verbose,
                    conv1_num_filter=conv1_num_filter,
                    conv2_num_filter=conv2_num_filter,
                    num_labels=24,
                    word2vec_model_file_path=word2vec_model_file_path,
                )
Example #5
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        feature_type='word',
        num_filter_list=None,
        region_size_list=None,
        word2vec_to_solve_oov=False,
        word2vec_model_file_path=None,
        verbose=0,
    ):

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score
        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=3,
                train_data=train_data,
                test_data=test_data,
                include_train_data=True,
            )

        # 2. Transform the data with the feature encoder
        feature_encoder = OnehotBowCNN.get_feature_encoder(
            input_length=input_length,
            verbose=verbose,
            feature_type=feature_type,
            word2vec_to_solve_oov=word2vec_to_solve_oov,
            word2vec_model_file_path=word2vec_model_file_path,
        )
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)
        # Cross-validation
        for num_filter in num_filter_list:
            for region_size in region_size_list:
                print('=' * 40)
                print('num_filter and region_size are %d, %d.' %
                      (num_filter, region_size))
                get_val_score(OnehotCNNWithOneConv,
                              cv_data=cv_data,
                              verbose=verbose,
                              region_size=region_size,
                              num_filter=num_filter,
                              num_labels=24)
Example #6
# +++++++++++++ region start : 3. Model training +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('3. Model training')


from deep_learning.cnn.wordEmbedding_cnn.wordEmbedding_cnn_model import WordEmbeddingCNN

cv_data = data_util.get_k_fold_data(k=3,
                                    data=train_data,
                                    rand_seed=0,
                                    )
config['word2vec_model_file_path'] = word2vec_model_file_path
all_cv_data = transform_cv_data(
    feature_encoder,
    cv_data,
    (test_data[u'SENTENCE'].as_matrix(), test_all_y),
    to_embedding_weight=True,
    **config)
train_X_feature, train_y, test_X_feature, test_y, init_weight = all_cv_data[0]

for seed in config['rand_seed']:
    print('=' * 150)
    print('rand seed:%d' % seed)
    print('=' * 150)

    # Set file paths
    # model_file_path = ''.join([str(item) for item in config['model_file_path']])
    # model_file_path = model_file_path % seed

    # result_file_path = ''.join([str(item) for item in config['result_file_path']])
    # result_file_path = result_file_path % seed
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        shuffle_data=True,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
        n_estimators_list=None,
    ):

        print('=' * 80)
        print(
            'feature_type:%s,need_segmented:%s,vocabulary_including_test_set:%s'
            % (feature_type, need_segmented, vocabulary_including_test_set))
        print('rand_weight:%s,embedding_weight_trainable:%s' %
              (rand_weight, embedding_weight_trainable))
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Transform the data with the feature encoder
        feature_encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            # Keep the dictionary consistent
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        cv_data = transform_cv_data(feature_encoder,
                                    cv_data,
                                    verbose=verbose,
                                    diff_train_val_feature_encoder=1)

        # Cross-validation
        for num_filter in num_filter_list:
            for n_estimators in n_estimators_list:
                print('=' * 40)
                print('num_filter and n_estimators are %d, %d.' %
                      (num_filter, n_estimators))
                get_val_score(
                    RFAndRFAndWordEmbeddingCnnMerge,
                    num_filter=num_filter,
                    n_estimators=n_estimators,
                    cv_data=cv_data[:],
                    verbose=verbose,
                    num_labels=num_labels,
                    word2vec_model_file_path=word2vec_model_file_path,
                    embedding_weight_trainable=embedding_weight_trainable,
                    need_validation=need_validation,
                    rand_weight=rand_weight,
                    batch_size=batch_size,
                    lr=lr,
                    shuffle_data=shuffle_data,
                )
Example #8
    def cross_validation(cv_data, test_data, result_file_path, **kwargs):
        """
            进行参数的交叉验证

        :param cv_data: k份训练数据
        :type cv_data: array-like
        :param test_data: 测试数据
        :type test_data: array-like
        :return:
        """

        nb_epoch = kwargs['nb_epoch']
        verbose = kwargs['verbose']
        num_labels = kwargs['num_labels']
        word_input_length, seg_input_length = 10, 7
        remove_stopword = kwargs['remove_stopword']
        word2vec_to_solve_oov = kwargs['word2vec_to_solve_oov']
        rand_seed = kwargs['rand_seed']
        l1_conv_filter_type = kwargs['l1_conv_filter_type']
        l2_conv_filter_type = kwargs['l2_conv_filter_type']
        k = kwargs['k']
        lr = kwargs['lr']

        use_layer = kwargs['use_layer']

        layer1 = kwargs['layer1'] if kwargs.get('layer1', []) != [] else [-1]
        layer2 = kwargs['layer2'] if kwargs.get('layer2', []) != [] else [-1]
        hidden1 = kwargs['hidden1'] if kwargs.get('hidden1', []) != [] else [-1]
        hidden2 = kwargs['hidden2'] if kwargs.get('hidden2', []) != [] else [-1]

        # Detailed results are saved to ...
        detail_result_file_path = result_file_path
        fout = open(detail_result_file_path, 'w')
        print('=' * 150)
        print('Parameters being tuned...')
        print('use_layer:%s' % use_layer)
        print('layer1:%s' % str(layer1))
        print('layer2:%s' % str(layer2))
        print('hidden1:%s' % str(hidden1))
        print('hidden2:%s' % str(hidden2))
        print('-' * 150)
        print('word_input_length:%d\nseg_input_length:%d' % (word_input_length, seg_input_length))
        print('use word2vec:%s\nremove_stopword:%s\nnb_epoch:%d\nrand_seed:%d' % (
            word2vec_to_solve_oov, remove_stopword, nb_epoch, rand_seed))
        print('l1_conv_filter_type:%s' % l1_conv_filter_type)
        print('l2_conv_filter_type:%s' % l2_conv_filter_type)
        print('k:%s' % k)
        print('=' * 150)

        fout.write('=' * 150 + '\n')
        fout.write('cv results:\n')
        fout.write('lr:%f\nnb_epoch:%d\nrand_seed:%d\n' % (lr, nb_epoch, rand_seed))
        fout.write('l1_conv_filter_type:%s\n' % l1_conv_filter_type)
        fout.write('l2_conv_filter_type:%s\n' % l2_conv_filter_type)
        fout.write('k:%s\n' % k)
        fout.write('=' * 150 + '\n')

        from itertools import product  # needed for the parameter grid below
        from data_processing_util.cross_validation_util import transform_cv_data, get_val_score
        word_feature_encoder, seg_feature_encoder = MultiChannelOnehotBowCNN.get_feature_encoder(
            **{'word_input_length': word_input_length,
               'seg_input_length': seg_input_length}
        )


        all_cv_word_data = transform_cv_data(word_feature_encoder, cv_data, test_data, **kwargs)
        all_cv_seg_data = transform_cv_data(seg_feature_encoder, cv_data, test_data, **kwargs)
        cv_data = [
            ([dev_word_X, dev_seg_X], dev_y, [val_word_X, val_seg_X], val_y,
             (word_feature_encoder, seg_feature_encoder))
            for (dev_word_X, dev_y, val_word_X, val_y, word_feature_encoder),
                (dev_seg_X, dev_y, val_seg_X, val_y, seg_feature_encoder)
            in zip(all_cv_word_data, all_cv_seg_data)
        ]

        # Cross-validation
        parameter_grid = product(layer1, layer2, hidden1, hidden2)

        for l1, l2, h1, h2 in parameter_grid:

            fout.write('=' * 150 + '\n')
            fout.write('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d\n' % (l1, l2, h1, h2))
            print('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d' % (l1,l2,h1,h2))

            l1_conv_filter = []
            if 'conv1' in use_layer:
                l1_conv_filter.extend([
                    [l1, l1_conv_filter_type[0][0], -1, l1_conv_filter_type[0][1], (0, 1), 0., 'relu', 'none'],
                    [l1, l1_conv_filter_type[1][0], -1, l1_conv_filter_type[1][1], (0, 1), 0., 'relu', 'none'],
                    [l1, l1_conv_filter_type[2][0], -1, l1_conv_filter_type[2][1], (0, 1), 0., 'relu', 'none'],
                ])

            full_connected_layer_units = []

            if 'hidden1' in use_layer:
                full_connected_layer_units.append([h1, 0., 'relu', 'none'])

            parm = {'l1_conv_filter_type': l1_conv_filter,
                    'full_connected_layer_units': full_connected_layer_units,
                    'num_labels': num_labels,
                    'verbose': verbose,
                    'nb_epoch': nb_epoch,
                    'lr': lr
                    }
            get_val_score(MultiChannelOnehotBowCNN, cv_data, fout, **parm)



        fout.close()
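
The list comprehension above pairs the word-channel and seg-channel folds and merges them into multi-channel inputs. A tiny self-contained illustration of that merge pattern, on toy data rather than project code:

# Toy illustration of merging two per-fold feature sets into multi-channel inputs.
word_folds = [([1, 2], [0, 1], [3], [1], 'word_enc')]  # (dev_X, dev_y, val_X, val_y, encoder)
seg_folds = [([10, 20], [0, 1], [30], [1], 'seg_enc')]
merged = [([dev_w, dev_s], dev_y, [val_w, val_s], val_y, (w_enc, s_enc))
          for (dev_w, dev_y, val_w, val_y, w_enc), (dev_s, _, val_s, _, s_enc)
          in zip(word_folds, seg_folds)]
print(merged)
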
    def cross_validation(
            train_data=None,
            test_data=None,
            cv_data=None,
            feature_type='word',
            input_length=None,
            num_filter_list=None,
            verbose=0,
            cv=3,
            batch_size=32,
            lr=1e-2,
            need_segmented=True,
            word2vec_model_file_path=None,
            num_labels=24,
            embedding_weight_trainable=False,
            shuffle_data=True,
            rand_weight=False,
            need_validation=True,
            include_train_data=True,
            vocabulary_including_test_set=True,
            n_estimators_list=None,
    ):

        print('=' * 80)
        print('feature_type:%s,need_segmented:%s,vocabulary_including_test_set:%s'
              % (feature_type, need_segmented, vocabulary_including_test_set))
        print('rand_weight:%s,embedding_weight_trainable:%s' % (rand_weight, embedding_weight_trainable))
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Transform the data with the feature encoder
        feature_encoder = RFAndWordEmbeddingCnnMerge.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            # Keep the dictionary consistent
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )
        # Transform the data
        # diff_train_val_feature_encoder=1 ----> force the feature_encoder dictionaries of the training and validation sets to differ.
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, shuffle_data=shuffle_data,
                                    diff_train_val_feature_encoder=1)

        # Cross-validation
        for num_filter in num_filter_list:
            for n_estimators in n_estimators_list:
                print('=' * 40)
                print('num_filter and n_estimators are %d, %d.' % (num_filter, n_estimators))
                get_val_score(RFAndRFAndWordEmbeddingCnnMerge,
                              num_filter=num_filter,
                              n_estimators=n_estimators,
                              cv_data=cv_data[:],
                              verbose=verbose,
                              num_labels=num_labels,
                              word2vec_model_file_path=word2vec_model_file_path,
                              embedding_weight_trainable=embedding_weight_trainable,
                              need_validation=need_validation,
                              rand_weight=rand_weight,
                              batch_size=batch_size,
                              lr=lr,
                              )
Example #10
    need_segmented = True
    feature_encoder = RFAndWordEmbeddingCnnMerge.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        # Keep the dictionary consistent
        update_dictionary=False,
        vocabulary_including_test_set=True,
    )

    # Transform the dataset into features
    cv_data = transform_cv_data(
        feature_encoder=feature_encoder,
        cv_data=[[0, train_X, train_y, test_X, test_y]],
        shuffle_data=True,
        diff_train_val_feature_encoder=1,
        verbose=1,
    )[0]
    # Build the RF(CNN(static-w2v)) model
    model = RFAndWordEmbeddingCnnMerge(
        feature_encoder,
        num_filter=110,
        num_labels=24,
        n_estimators=500,
        word2vec_model_file_path=word2vec_model_file_path,
        dataset_flag=0,
        verbose=0,
        init_model=True,
    )
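
A hypothetical continuation, not in the source: assuming RFAndWordEmbeddingCnnMerge exposes the same fit signature that SingleChannelBowCNN uses in Example #14 below, and that cv_data unpacks as in Example #6, training and scoring might look like this sketch.

# Hypothetical -- both the cv_data tuple layout and the fit() signature are assumptions.
train_X, train_y, test_X, test_y = cv_data[:4]
train_loss, train_acc, test_loss, test_acc = model.fit((train_X, train_y), (test_X, test_y))
print('train acc:%f, test acc:%f' % (train_acc, test_acc))
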
Example #12
def process_train_data_for_k_fold(k=3):
    '''
        Split the training data into K folds for cross-validation, keeping classes as evenly distributed as possible.
        Input file:  v2.2/v2.2_train_Sa_884.csv
        Output files: v2.2/v2.2_train_Sa_i%d_%d.csv

    :return:
    '''

    import numpy as np  # used by np.concatenate / np.savetxt below
    import pandas as pd  # used by pd.DataFrame below
    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'
    # L or Sa
    dataset_type = 'S'

    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0'
    }
    train_data, test_data = data_util.load_train_test_data(config)
    label_to_index, index_to_label = data_util.get_label_index(
        config['label_version'])
    # print(train_data.head())
    train_X = train_data['SENTENCE'].as_matrix()
    train_y = train_data['LABEL_INDEX'].as_matrix()
    test_X = test_data['SENTENCE'].as_matrix()
    test_y = test_data['LABEL_INDEX'].as_matrix()

    cv_x = []
    cv_y = []
    for index, (x, y) in enumerate(
            data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)):
        cv_x.append(x)
        cv_y.append(y)
        # print(y)
        y = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': y, 'SENTENCE': x})
        data_util.save_data(
            cv_data, 'result/cv_data/v2.3_train_%s_i%d_%d.csv' %
            (dataset_type, index, len(cv_data)))
        print(len(x))
    # quit()
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y),
                                    (test_X, test_y),
                                    verbose=1)

    counter = 0
    for dev_X, dev_y, val_X, val_y in all_cv_data:
        counter += 1
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' %
            (dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' %
            (dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )
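
data_split_k_fold above splits the training data into k folds "as evenly per class as possible"; scikit-learn's StratifiedKFold gives the same guarantee. A self-contained sketch of the equivalent split, as a generic analogue rather than project code:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array(['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9'])
y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=3)
for fold, (_, fold_idx) in enumerate(skf.split(X, y)):
    # Each fold keeps roughly one sample per class.
    print('fold %d:' % fold, X[fold_idx], y[fold_idx])
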
Example #13
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        # fetch the middle-layer output
        get_cnn_middle_layer_output=False,
        middle_layer_output_file=None,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
    ):
        """

        Parameters
        ----------
        train_data : array-like
            training data (train_X, train_y)
        test_data : array-like
            test data
        cv_data : array-like
            k folds of validation data
        input_length : int
            input length
        num_filter_list : array-like
            parameter to validate: number of filters
        middle_layer_output_file : str
            file to which the middle-layer output is written
        get_cnn_middle_layer_output : bool
            whether to fetch the middle-layer output (default False)
        num_labels: int
            number of labels
        batch_size : int
            batch size
        vocabulary_including_test_set: bool, default True
            whether the dictionary includes the test set
        include_train_data : bool
            whether to also validate on the training data
        need_validation: bool
            whether to run validation
        embedding_weight_trainable : bool
            switches between CNN(static-w2v) and CNN(non-static-w2v)
        rand_weight : bool
            switches between CNN(rand) and CNN(static/non-static-w2v)
        feature_type : str
            feature type
        verbose : int
            the larger the value, the more verbose the output
        cv : int
            run cv-fold validation
        need_segmented : bool
            whether word segmentation is needed
        word2vec_model_file_path

        Notes
        ----------
        - For efficiency, update_dictionary=False is set by default so the feature encoder's dictionary stays consistent and is not rebuilt repeatedly.
        - diff_train_val_feature_encoder=1 is also set so the feature encoders on the training and validation sets differ, since the dictionary sizes differ.

        Examples
        ----------
        >>> train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
        >>> train_y = [1, 2, 3, 2, 3]
        >>> test_x = ['你好', '不错哟']
        >>> test_y = [1, 2]
        >>> cv_x = [['你好', '无聊'], ['测试句子', '今天天气不错'], ['我要买手机']]
        >>> cv_y = [[1, 3], [2, 2], [3]]
        >>> WordEmbeddingCNNWithOneConv.cross_validation(
        >>>         train_data = (train_x,train_y),
        >>>         test_data=(test_x,test_y),
        >>>         input_length=8,
        >>>         num_filter_list=[5,50],
        >>>         verbose=1,
        >>>         word2vec_model_file_path = '/home/jdwang/PycharmProjects/nlp_util/data_processing_util/word2vec_util/vector/50dim/vector1000000_50dim.gem',
        >>>     )

        """
        print('=' * 80)
        print(
            'feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s'
            % (feature_type, need_segmented, vocabulary_including_test_set))
        print('input_length: %d, num_labels: %d' % (input_length, num_labels))
        print(
            'lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s'
            % (lr, batch_size, rand_weight, embedding_weight_trainable))
        if not rand_weight:
            print('W2V model file_path: %s' % word2vec_model_file_path)
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Transform the data with the feature encoder
        feature_encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            padding_mode='center',
            # Keep the dictionary consistent
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        cv_data = transform_cv_data(feature_encoder,
                                    cv_data,
                                    verbose=verbose,
                                    diff_train_val_feature_encoder=1)

        # Cross-validation
        for num_filter in num_filter_list:
            print('=' * 40)
            print('num_filter is %d.' % num_filter)
            _, _, middle_output_dev, middle_output_val = get_val_score(
                WordEmbeddingCNNWithOneConv,
                cv_data=cv_data[:],
                verbose=verbose,
                num_filter=num_filter,
                num_labels=num_labels,
                word2vec_model_file_path=word2vec_model_file_path,
                embedding_weight_trainable=embedding_weight_trainable,
                get_cnn_middle_layer_output=get_cnn_middle_layer_output,
                need_validation=need_validation,
                rand_weight=rand_weight,
                batch_size=batch_size,
                lr=lr,
            )

            if get_cnn_middle_layer_output:
                # Save the results
                with open(middle_layer_output_file, 'w') as fout:
                    # Save the intermediate outputs
                    pickle.dump(cv_data, fout)
                    pickle.dump(middle_output_dev, fout)
                    pickle.dump(middle_output_val, fout)
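
This snippet, like most of these examples (judging by the %-style prints and u'' literals), targets Python 2, where pickling into a text-mode file works. Under Python 3, pickle requires a binary-mode file. A minimal self-contained sketch of the portable form:

import pickle

results = {'dev': [0.9], 'val': [0.8]}  # placeholder objects
with open('middle_layer_output.pkl', 'wb') as fout:  # 'wb', not 'w', under Python 3
    pickle.dump(results, fout)
with open('middle_layer_output.pkl', 'rb') as fin:
    print(pickle.load(fin))
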
Example #14
# ------------------------------------------------------------------------------
# -------------- region start : Generate the CNN deep feature encoder -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Generate the CNN deep feature encoder')

from deep_learning.cnn.bow_cnn.single_channel_bow_cnn_model import SingleChannelBowCNN

cv_data = data_util.get_k_fold_data(k=3,
                                    data=train_data,
                                    rand_seed=3,
                                    )
all_cv_data = transform_cv_data(
    feature_encoder,
    cv_data,
    (test_data[u'SENTENCE'].as_matrix(), test_y),
    **config)
train_X_feature, train_y, test_X_feature, test_y, _, _ = all_cv_data[0]

bow_cnn = SingleChannelBowCNN(
    rand_seed=rand_seed,
    verbose=1,
    feature_encoder=feature_encoder,
    num_labels=len(index_to_label),
    input_length=train_X_feature.shape[-1],
    l1_conv_filter_type=[
        # [layer1, l1_conv_filter_type[0], -1, 'valid', (k[0], 1), 0., 'relu', 'none'],
        # [layer1, l1_conv_filter_type[1], -1, 'valid', (k[0], 1), 0., 'relu', 'batch_normalization'],
        # [layer1, l1_conv_filter_type[2], -1, 'valid', (k[0], 1), 0., 'relu', 'batch_normalization'],
    ],
    def cross_validation(cv_data, test_data, result_file_path, **kwargs):
        '''
            Cross-validate over the parameter grid.

        :param cv_data: k folds of training data
        :type cv_data: array-like
        :param test_data: test data
        :type test_data: array-like
        :param result_file_path: file to which detailed results are written
        :return:
        '''

        nb_epoch = kwargs['nb_epoch']
        verbose = kwargs['verbose']
        num_labels = 24
        feature_type = kwargs['feature_type']
        remove_stopword = kwargs['remove_stopword']

        word2vec_to_solve_oov = kwargs['word2vec_to_solve_oov']
        rand_seed = kwargs['rand_seed']
        l1_conv_filter_type = kwargs['l1_conv_filter_type']
        l2_conv_filter_type = kwargs['l2_conv_filter_type']
        k = kwargs['k']

        # Detailed results are saved to ...
        detail_result_file_path = result_file_path
        fout = open(detail_result_file_path, 'w')

        print('=' * 150)

        print(
            'use word2vec:%s\nfeature_type:%s\nremove_stopword:%s\nnb_epoch:%d\nrand_seed:%d'
            % (word2vec_to_solve_oov, feature_type, remove_stopword, nb_epoch,
               rand_seed))
        print('l1_conv_filter_type:%s' % l1_conv_filter_type)
        print('l2_conv_filter_type:%s' % l2_conv_filter_type)
        print('k:%s' % k)
        print('=' * 150)

        fout.write('=' * 150 + '\n')
        fout.write('single-channel CNN-bow cv results:\n')
        fout.write('feature_type:%s\nnb_epoch:%d\nrand_seed:%d\n' %
                   (feature_type, nb_epoch, rand_seed))
        fout.write('l1_conv_filter_type:%s\n' % l1_conv_filter_type)
        fout.write('l2_conv_filter_type:%s\n' % l2_conv_filter_type)
        fout.write('k:%s\n' % k)
        fout.write('=' * 150 + '\n')

        import numpy as np  # used by np.average below
        from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
        from data_processing_util.cross_validation_util import transform_cv_data
        feature_encoder = FeatureEncoder(need_segmented=True,
                                         full_mode=True,
                                         replace_number=True,
                                         lowercase=True,
                                         zhs2zht=True,
                                         remove_url=True,
                                         feature_method='bow',
                                         max_features=2000,
                                         **kwargs)

        all_cv_data = transform_cv_data(feature_encoder, cv_data, test_data,
                                        **kwargs)

        for layer1 in kwargs['layer1']:
            for layer2 in kwargs['layer2']:
                for hidden1 in kwargs['hidden1']:
                    for hidden2 in kwargs['hidden2']:

                        print('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d' %
                              (layer1, layer2, hidden1, hidden2))

                        fout.write('=' * 150 + '\n')
                        fout.write(
                            'layer1:%d,layer2:%d,hidden1:%d,hidden2:%d\n' %
                            (layer1, layer2, hidden1, hidden2))
                        # Five folds
                        print('K-fold cross-validation starting...')
                        counter = 0
                        test_acc = []
                        train_acc = []
                        for dev_X, dev_y, val_X, val_y in all_cv_data:
                            # print(dev_X2.shape)
                            print('-' * 80)
                            fout.write('-' * 80 + '\n')
                            if counter == 0:
                                # The first entry is the training run; the rest are cross-validation folds
                                print('Training:')
                                fout.write('Training\n')
                            else:
                                print('Validation fold %d' % counter)
                                fout.write('Validation fold %d\n' % counter)

                            bow_cnn = SingleChannelBowCNN(
                                rand_seed=rand_seed,
                                verbose=verbose,
                                feature_encoder=None,
                                num_labels=num_labels,
                                input_length=dev_X.shape[1],
                                l1_conv_filter_type=[
                                    # [layer1, l1_conv_filter_type[0], -1, 'valid', (k[0], 1), 0.,'relu', 'none'],
                                    # [layer1, l1_conv_filter_type[1], -1, 'valid', (k[0], 1), 0.,'relu', 'none'],
                                    # [layer1, l1_conv_filter_type[2], -1, 'valid', (k[0], 1), 0.,'relu', 'batch_normalization'],
                                ],
                                l2_conv_filter_type=[[
                                    layer2, l2_conv_filter_type[0], -1,
                                    'valid', (k[1], 1), 0., 'relu',
                                    'batch_normalization'
                                ]],
                                full_connected_layer_units=[
                                    (hidden1, 0.5, 'relu', 'none'),
                                    (hidden2, 0.5, 'relu', 'none')
                                ],
                                nb_epoch=nb_epoch,
                                earlyStoping_patience=50,
                                optimizers='sgd',
                                batch_size=32,
                                lr=1e-2,
                            )

                            # bow_cnn.print_model_descibe()

                            dev_loss, dev_accuracy, \
                            val_loss, val_accuracy = bow_cnn.fit((dev_X, dev_y), (val_X, val_y))

                            print('dev:%f,%f' % (dev_loss, dev_accuracy))
                            print('val:%f,%f' % (val_loss, val_accuracy))
                            fout.write('dev:%f,%f\n' %
                                       (dev_loss, dev_accuracy))
                            fout.write('val:%f,%f\n' %
                                       (val_loss, val_accuracy))
                            test_acc.append(val_accuracy)
                            train_acc.append(dev_accuracy)
                            counter += 1

                        print('k-fold validation results:%s' % test_acc)
                        print('average validation accuracy:%f' % np.average(test_acc[1:]))
                        print('-' * 80)

                        fout.write('k-fold training results:%s\n' % train_acc)
                        fout.write('k-fold test results:%s\n' % test_acc)
                        fout.write('average:%f\n' % np.average(test_acc[1:]))
                        fout.write('-' * 80 + '\n')
                        fout.flush()
        fout.close()
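
The four nested parameter loops in this example can be flattened with itertools.product, as Example #8 already does. A minimal self-contained sketch of that pattern:

from itertools import product

layer1, layer2, hidden1, hidden2 = [16], [32], [64, 128], [32]
for l1, l2, h1, h2 in product(layer1, layer2, hidden1, hidden2):
    # One iteration per parameter combination, identical to the nested loops.
    print('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d' % (l1, l2, h1, h2))
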
Example #17
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        shuffle_data=True,
        n_estimators_list=None,
        feature_type='word',
        word2vec_to_solve_oov=False,
        word2vec_model_file_path=None,
        verbose=0,
        cv=3,
        need_transform_input=True,
        need_segmented=True,
        need_validation=True,
        include_train_data=True,
    ):
        """进行参数的交叉验证

        Parameters
        ----------
        word2vec_model_file_path : str
            word2vec模型路径
        train_data : (array-like,array-like)
            训练数据 (train_X,train_y)
        test_data : (array-like,array-like)
            测试数据 (test_X,test_y)
        cv_data : array-like
            k份验证数据
        word2vec_to_solve_oov : bool
            是否使用 w2v 去替换
        n_estimators_list : array-like
            验证参数,随机森林棵树
        feature_type : str
            特征类型, only in ['word','seg','word_seg']
        shuffle_data : bool
            是否打乱数据
        verbose : int
            数值越大,输出越详细
        cv:int
            进行 cv 折验证
        need_transform_input : bool
            是否需要转换数据
        need_segmented:bool
            是否需要分词
        include_train_data:
            是否包含训练数据一样验证
        need_validation:
            是否要验证
        """

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score
        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )
        # region 2. Transform the data with the feature encoder
        if need_transform_input:
            feature_encoder = BowRandomForest.get_feature_encoder(
                verbose=verbose,
                need_segmented=need_segmented,
                feature_type=feature_type,
                word2vec_to_solve_oov=word2vec_to_solve_oov,
                word2vec_model_file_path=word2vec_model_file_path,
            )
            # diff_train_val_feature_encoder=1: use a different feature encoder each time
            cv_data = transform_cv_data(feature_encoder,
                                        cv_data,
                                        verbose=verbose,
                                        diff_train_val_feature_encoder=1)
        else:
            if len(cv_data[0]) < 6:
                # Each cv_data entry needs 6 items; pad any that fall short
                cv_data = [item + [None] for item in cv_data]
        # endregion

        # region 3. Cross-validation
        for n_estimators in n_estimators_list:
            print('=' * 40)
            print('n_estimators is %d.' % n_estimators)
            get_val_score(
                BowRandomForest,
                cv_data=cv_data[:],
                verbose=verbose,
                shuffle_data=shuffle_data,
                need_validation=need_validation,
                n_estimators=n_estimators,
            )
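
The loop above sweeps n_estimators and scores each setting on the k folds. For comparison only, the analogous sweep with scikit-learn's own random forest and cross_val_score, as a self-contained generic sketch rather than the project's BowRandomForest:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Toy data standing in for the bag-of-words features.
X, y = make_classification(n_samples=300, n_features=30, random_state=0)
for n_estimators in [10, 50, 100]:
    scores = cross_val_score(
        RandomForestClassifier(n_estimators=n_estimators, random_state=0), X, y, cv=3)
    print('n_estimators=%d: mean accuracy %.3f' % (n_estimators, np.average(scores)))
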
Example #19
from deep_learning.cnn.wordEmbedding_cnn.multichannel_onehot_cnn_model import MultiChannelOnehotBowCNN
# Get the feature encoders for this classifier
word_feature_encoder, seg_feature_encoder = MultiChannelOnehotBowCNN.get_feature_encoder(
    **{'word_input_length': word_input_length,
       'seg_input_length': seg_input_length}
)


train_y = train_data['LABEL_INDEX'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()

cv_data = data_util.get_k_fold_data(k=3,
                                    data=train_data,
                                    rand_seed=3,
                                    )
all_cv_word_data = transform_cv_data(word_feature_encoder, cv_data,
                                     (test_data[u'SENTENCE'].as_matrix(), test_y), **config)

all_cv_seg_data = transform_cv_data(seg_feature_encoder, cv_data,
                                    (test_data[u'SENTENCE'].as_matrix(), test_y), **config)

train_X_word_feature, train_y, test_X_word_feature, test_y, word_feature_encoder = all_cv_word_data[0]
train_X_seg_feature, train_y, test_X_seg_feature, test_y, seg_feature_encoder = all_cv_seg_data[0]



logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : 2. Convert the data format and encode features -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 3. Build the one-hot encoding +++++++++++++