Example #1
    def get_k_fold_data(
        self,
        k=5,
        data=None,
        rand_seed=0,
    ):
        '''
            Split the data into K folds.

        :param k: number of folds
        :param data: data to split, with 'SENTENCE' and 'LABEL_INDEX' columns
        :type data: pd.DataFrame
        :return: two lists, the per-fold sentences and the per-fold labels
        '''

        train_X = data['SENTENCE'].to_numpy()
        train_y = data['LABEL_INDEX'].to_numpy()

        cv_x = []
        cv_y = []
        for x, y in data_split_k_fold(k=k,
                                      data=(train_X, train_y),
                                      rand_seed=rand_seed):
            cv_x.append(x)
            cv_y.append(y)
        return cv_x, cv_y
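data_split_k_fold itself is not shown in these examples. Below is a minimal sketch of what such a generator might look like, assuming it yields one (x, y) array pair per fold and deals each class out round-robin after shuffling with rand_seed, so every fold keeps roughly the original label distribution (the "split by class where possible" behaviour described in Example #2's docstring); the details here are an assumption, not the project's actual implementation.

import numpy as np

def data_split_k_fold(k=5, data=None, rand_seed=0):
    # Hypothetical stand-in for the project's helper: yield one
    # (x, y) pair per fold, splitting each class separately so the
    # folds stay roughly stratified by label.
    train_X, train_y = data
    train_X = np.asarray(train_X)
    train_y = np.asarray(train_y)
    rng = np.random.RandomState(rand_seed)

    folds_x = [[] for _ in range(k)]
    folds_y = [[] for _ in range(k)]
    for label in np.unique(train_y):
        idx = np.where(train_y == label)[0]
        rng.shuffle(idx)
        # Deal this class's samples round-robin across the k folds.
        for pos, i in enumerate(idx):
            folds_x[pos % k].append(train_X[i])
            folds_y[pos % k].append(train_y[i])

    for x, y in zip(folds_x, folds_y):
        yield np.asarray(x), np.asarray(y)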
Example #2
def process_train_data_for_k_fold(k=3):
    '''
        Split the training data into K folds for cross-validation,
        keeping each fold stratified by class as far as possible.
        Input file:  v2.2/v2.2_train_Sa_884.csv
        Output files: v2.2/v2.2_train_Sa_i%d_%d.csv

    :return: None; the folds are written to disk
    '''

    import numpy as np
    import pandas as pd

    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'
    # L or Sa
    dataset_type = 'S'

    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0',
    }
    train_data, test_data = data_util.load_train_test_data(config)
    label_to_index, index_to_label = data_util.get_label_index(config['label_version'])
    # print(train_data.head())
    train_X = train_data['SENTENCE'].to_numpy()
    train_y = train_data['LABEL_INDEX'].to_numpy()
    test_X = test_data['SENTENCE'].to_numpy()
    test_y = test_data['LABEL_INDEX'].to_numpy()

    cv_x = []
    cv_y = []
    for index, (x, y) in enumerate(data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)):
        cv_x.append(x)
        cv_y.append(y)
        # print(y)
        y = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': y, 'SENTENCE': x})
        data_util.save_data(cv_data, 'result/cv_data/v2.3_train_%s_i%d_%d.csv' % (dataset_type, index, len(cv_data)))
        print(len(x))
    # quit()
    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y), (test_X, test_y), verbose=1)

    counter = 0
    for dev_X, dev_y, val_X, val_y in all_cv_data:
        counter += 1
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' % (
                dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' % (
                dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )
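The stratified splitting itself has a close analogue in scikit-learn. Below is a sketch of the split-and-save loop above written with StratifiedKFold instead of the project's data_split_k_fold; the column names and path pattern follow the example, and writing plain CSV stands in for data_util.save_data.

import pandas as pd
from sklearn.model_selection import StratifiedKFold

def save_stratified_folds(train_data, k=3, rand_seed=3):
    # Split into k class-stratified folds and write each fold to CSV,
    # mirroring the loop above without the project's helper functions.
    X = train_data['SENTENCE'].to_numpy()
    y = train_data['LABEL_INDEX'].to_numpy()
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=rand_seed)
    for index, (_, fold_idx) in enumerate(skf.split(X, y)):
        fold = pd.DataFrame({'LABEL_INDEX': y[fold_idx],
                             'SENTENCE': X[fold_idx]})
        fold.to_csv('result/cv_data/v2.3_train_S_i%d_%d.csv'
                    % (index, len(fold)), index=False)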
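FeatureEncoder and transform_cv_data are likewise project-specific. Judging by the parameters (feature_method='bow', max_features=2000) and the consuming loop, transform_cv_data appears to yield one (dev_X, dev_y, val_X, val_y) tuple per fold, with a bag-of-words vocabulary fitted on each dev split. A rough sketch of that behaviour with scikit-learn's CountVectorizer, under those assumptions:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def transform_cv_data_sketch(cv_x, cv_y, max_features=2000):
    # Rough approximation of transform_cv_data: for each fold i, treat
    # fold i as the validation set and the remaining folds as the dev
    # set, fit a bag-of-words vocabulary on the dev sentences only,
    # and encode both sets with it. Sentences are assumed to be
    # whitespace-segmented already (need_segmented=True handles this
    # in the original); the test-set pair is omitted here.
    for i in range(len(cv_x)):
        dev_sents = [s for j, fold in enumerate(cv_x) if j != i for s in fold]
        dev_labels = [l for j, fold in enumerate(cv_y) if j != i for l in fold]
        vectorizer = CountVectorizer(max_features=max_features)
        dev_X = vectorizer.fit_transform(dev_sents).toarray()
        val_X = vectorizer.transform(cv_x[i]).toarray()
        yield dev_X, np.asarray(dev_labels), val_X, np.asarray(cv_y[i])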