def cross_validation(
            train_data=None,
            test_data=None,
            cv_data=None,
            feature_type='word',
            input_length=None,
            num_filter_list=None,
            verbose=0,
            cv=3,
            batch_size=32,
            lr=1e-2,
            need_segmented=True,
            word2vec_model_file_path=None,
            num_labels=24,
            embedding_weight_trainable=False,
            # fetch the middle-layer output
            get_cnn_middle_layer_output=False,
            middle_layer_output_file=None,
            rand_weight=False,
            need_validation=True,
            include_train_data=True,
            vocabulary_including_test_set=True,
    ):
        """

        Parameters
        ----------
        train_data : array-like
            training data, as a (train_X, train_y) tuple
        test_data : array-like
            test data
        cv_data : array-like
            k folds of validation data
        input_length : int
            input length
        num_filter_list : array-like
            parameter values to validate: the number of filters
        middle_layer_output_file : str
            file to which the middle-layer output is written
        get_cnn_middle_layer_output : bool
            whether to fetch the middle-layer output (default False)
        num_labels : int
            number of labels
        batch_size : int
            batch size
        vocabulary_including_test_set : bool, default True
            whether the vocabulary includes the test set
        include_train_data : bool
            whether to include the training data for validation as well
        need_validation : bool
            whether to run validation
        embedding_weight_trainable : bool
            switches between CNN(static-w2v) and CNN(non-static-w2v)
        rand_weight : bool
            switches between CNN(rand) and CNN(static/non-static-w2v)
        feature_type : str
            feature type
        verbose : int
            the larger the value, the more detailed the output
        cv : int
            number of cross-validation folds
        need_segmented : bool
            whether the input still needs word segmentation
        word2vec_model_file_path : str
            path to the word2vec model file

        Notes
        ----------
        - For efficiency, update_dictionary=False is set by default so that the feature encoder's vocabulary stays consistent and is not rebuilt repeatedly
        - diff_train_val_feature_encoder=1 is set as well so that the training and validation sets use different feature encoders, since their vocabulary sizes differ

        Examples
        ----------
        >>> train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
        >>> train_y = [1, 2, 3, 2, 3]
        >>> test_x = ['你好', '不错哟']
        >>> test_y = [1, 2]
        >>> cv_x = [['你好', '无聊'], ['测试句子', '今天天气不错'], ['我要买手机']]
        >>> cv_y = [[1, 3], [2, 2], [3]]
        >>> WordEmbeddingCNNWithOneConv.cross_validation(
        ...     train_data=(train_x, train_y),
        ...     test_data=(test_x, test_y),
        ...     input_length=8,
        ...     num_filter_list=[5, 50],
        ...     verbose=1,
        ...     word2vec_model_file_path='/home/jdwang/PycharmProjects/nlp_util/data_processing_util/word2vec_util/vector/50dim/vector1000000_50dim.gem',
        ... )

        """
        print('=' * 80)
        print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s' % (feature_type,
                                                                                      need_segmented,
                                                                                      vocabulary_including_test_set))
        print('input_length: %d, num_labels: %d' % (input_length, num_labels))
        print('lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s' % (lr, batch_size, rand_weight, embedding_weight_trainable))
        if not rand_weight:
            print('W2V model file_path: %s' % word2vec_model_file_path)
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score
        import pickle

        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Feature-encode the data
        feature_encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            padding_mode='center',
            # keep the vocabulary consistent across folds
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

        # Cross-validation over num_filter_list
        for num_filter in num_filter_list:
            print('=' * 40)
            print('num_filter is %d.' % num_filter)
            _, _, middle_output_dev, middle_output_val = get_val_score(
                WordEmbeddingCNNWithOneConv,
                cv_data=cv_data[:],
                verbose=verbose,
                num_filter=num_filter,
                num_labels=num_labels,
                word2vec_model_file_path=word2vec_model_file_path,
                embedding_weight_trainable=embedding_weight_trainable,
                get_cnn_middle_layer_output=get_cnn_middle_layer_output,
                need_validation=need_validation,
                rand_weight=rand_weight,
                batch_size=batch_size,
                lr=lr,
            )

            if get_cnn_middle_layer_output:
                # save the middle-layer results; pickle requires a binary-mode file
                with open(middle_layer_output_file, 'wb') as fout:
                    # dump the intermediate outputs sequentially, in a fixed order
                    pickle.dump(cv_data, fout)
                    pickle.dump(middle_output_dev, fout)
                    pickle.dump(middle_output_val, fout)
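
Because the middle-layer results above are written as three consecutive pickle.dump calls into a single file, they must be read back with three pickle.load calls in the same order. A minimal sketch, assuming the file was produced by cross_validation() above (the path is illustrative):

import pickle

with open('middle_layer_output.pkl', 'rb') as fin:
    cv_data = pickle.load(fin)            # the transformed cross-validation data
    middle_output_dev = pickle.load(fin)  # middle-layer outputs on the dev folds
    middle_output_val = pickle.load(fin)  # middle-layer outputs on the val folds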
Example #2
label_to_index, index_to_label = data_util.get_label_index(version=config['label_version'])

# ****************************************************************
# ------------- region end : 1. Load the training and test data -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. Convert the data format and feature-encode +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. Convert the data format and feature-encode')
from data_processing_util.cross_validation_util import transform_cv_data
from deep_learning.cnn.wordEmbedding_cnn.wordEmbedding_cnn_model import WordEmbeddingCNN

feature_encoder = WordEmbeddingCNN.get_feature_encoder(
    input_length=input_length,
    feature_type=feature_type,
)

train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())
feature_encoder.print_model_descibe()
feature_encoder.print_sentence_length_detail()

# train_y = train_data['LABEL_INDEX'].as_matrix()

test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix())

test_all_y = test_data['LABEL_INDEX'].as_matrix()
print(train_data['LABEL_INDEX'].as_matrix())

print(train_X_feature.shape)
print(test_all_X_feature.shape)
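
The key contract in this snippet: fit_transform builds the vocabulary from the training sentences, while transform reuses that fitted vocabulary for the test sentences. A minimal sketch with toy inputs (sentences and input_length are illustrative):

feature_encoder = WordEmbeddingCNN.get_feature_encoder(
    input_length=8,
    feature_type='word',
)
train_X = feature_encoder.fit_transform(train_data=['你好', '测试句子'])  # fits the vocabulary
test_X = feature_encoder.transform(['你好', '不错哟'])  # reuses the fitted vocabulary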
Example #3

    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        shuffle_data=True,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
        n_estimators_list=None,
    ):
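        """Cross-validate RFAndRFAndWordEmbeddingCnnMerge; mirrors the first
        example's cross_validation, with an extra grid over n_estimators_list
        and a shuffle_data switch.
        """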

        print('=' * 80)
        print(
            'feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s'
            % (feature_type, need_segmented, vocabulary_including_test_set))
        print('rand_weight: %s, embedding_weight_trainable: %s' %
              (rand_weight, embedding_weight_trainable))
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Feature-encode the data
        feature_encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            # keep the vocabulary consistent across folds
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        cv_data = transform_cv_data(feature_encoder,
                                    cv_data,
                                    verbose=verbose,
                                    diff_train_val_feature_encoder=1)

        # Cross-validation over the num_filter x n_estimators grid
        for num_filter in num_filter_list:
            for n_estimators in n_estimators_list:
                print('=' * 40)
                print('num_filter and n_estimators are %d, %d.' %
                      (num_filter, n_estimators))
                get_val_score(
                    RFAndRFAndWordEmbeddingCnnMerge,
                    num_filter=num_filter,
                    n_estimators=n_estimators,
                    cv_data=cv_data[:],
                    verbose=verbose,
                    num_labels=num_labels,
                    word2vec_model_file_path=word2vec_model_file_path,
                    embedding_weight_trainable=embedding_weight_trainable,
                    need_validation=need_validation,
                    rand_weight=rand_weight,
                    batch_size=batch_size,
                    lr=lr,
                    shuffle_data=shuffle_data,
                )
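
A hypothetical invocation of this variant, mirroring the doctest in the first example (train_x, train_y, test_x, test_y as defined there); n_estimators_list adds a random-forest grid alongside num_filter_list, and all values are illustrative:

RFAndRFAndWordEmbeddingCnnMerge.cross_validation(
    train_data=(train_x, train_y),
    test_data=(test_x, test_y),
    input_length=8,
    num_filter_list=[5, 50],
    n_estimators_list=[100, 500],
    verbose=1,
    word2vec_model_file_path='/home/jdwang/PycharmProjects/nlp_util/data_processing_util/word2vec_util/vector/50dim/vector1000000_50dim.gem',
)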