def get_k_fold_data(self, k=5, data=None, rand_seed=0):
    """Split the data into K folds for cross-validation.

    :param k: number of folds.
    :param data: training data with 'SENTENCE' and 'LABEL_INDEX' columns.
    :type data: pd.DataFrame
    :param rand_seed: seed forwarded to the splitter for reproducible folds.
    :return: (cv_x, cv_y) — parallel lists with one sentence array and one
        label array per fold.
    """
    # DataFrame/Series.as_matrix() was deprecated in pandas 0.23 and removed
    # in 1.0 — .to_numpy() is the supported replacement.
    train_X = data['SENTENCE'].to_numpy()
    train_y = data['LABEL_INDEX'].to_numpy()
    cv_x = []
    cv_y = []
    # data_split_k_fold yields (sentences, labels) per fold; presumably it
    # stratifies by label — TODO confirm against its definition.
    for x, y in data_split_k_fold(k=k, data=(train_X, train_y),
                                  rand_seed=rand_seed):
        cv_x.append(x)
        cv_y.append(y)
    return cv_x, cv_y
def get_k_fold_data(self, k=5, data=None, rand_seed=0):
    """Partition *data* into K cross-validation folds.

    NOTE(review): this definition duplicates an identical get_k_fold_data
    earlier in the file; only one copy should survive a cleanup.

    :param k: number of folds.
    :param data: DataFrame holding 'SENTENCE' and 'LABEL_INDEX' columns.
    :type data: pd.DataFrame
    :param rand_seed: seed passed through so folds are reproducible.
    :return: tuple (cv_x, cv_y) of per-fold sentence / label array lists.
    """
    # .as_matrix() was removed from pandas in 1.0; use .to_numpy() instead.
    sentences = data['SENTENCE'].to_numpy()
    labels = data['LABEL_INDEX'].to_numpy()
    cv_x, cv_y = [], []
    for fold_x, fold_y in data_split_k_fold(k=k,
                                            data=(sentences, labels),
                                            rand_seed=rand_seed):
        cv_x.append(fold_x)
        cv_y.append(fold_y)
    return cv_x, cv_y
def process_train_data_for_k_fold(k=3):
    """Split the training data into K folds for cross-validation, keeping the
    class distribution as balanced as possible, then BOW-encode each fold and
    dump everything to CSV.

    Input file:  v2.2/v2.2_train_Sa_884.csv
    Output files: v2.2/v2.2_train_Sa_i%d_%d.csv (raw folds) plus
        result/cv_data/..._dev_/..._val_ CSVs (encoded folds).

    :param k: number of folds.
    :return: None — results are written to disk.
    """
    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'
    # L or Sa
    dataset_type = 'S'
    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0',
    }
    train_data, test_data = data_util.load_train_test_data(config)
    label_to_index, index_to_label = data_util.get_label_index(config['label_version'])
    # print(train_data.head())

    # .as_matrix() was removed in pandas 1.0 — .to_numpy() is the supported API.
    train_X = train_data['SENTENCE'].to_numpy()
    train_y = train_data['LABEL_INDEX'].to_numpy()
    test_X = test_data['SENTENCE'].to_numpy()
    test_y = test_data['LABEL_INDEX'].to_numpy()

    # Collect the raw folds and also persist each one as a CSV for inspection.
    cv_x = []
    cv_y = []
    for index, (x, y) in enumerate(data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)):
        cv_x.append(x)
        cv_y.append(y)
        # print(y)
        y = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': y, 'SENTENCE': x})
        data_util.save_data(cv_data,
                            'result/cv_data/v2.3_train_%s_i%d_%d.csv' % (dataset_type, index, len(cv_data)))
        print(len(x))
    # quit()

    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y), (test_X, test_y), verbose=1)

    # Write each encoded fold out with the label prepended as column 0.
    counter = 0
    for dev_X, dev_y, val_X, val_y in all_cv_data:
        counter += 1
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' % (
                dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' % (
                dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )
def process_train_data_for_k_fold(k=3):
    """Split the training set into K folds (approximately stratified by
    class), save the raw folds, BOW-encode them, and export dev/val CSVs.

    NOTE(review): this duplicates an earlier process_train_data_for_k_fold
    in the same file; only one definition should remain after cleanup.

    Input file:  v2.2/v2.2_train_Sa_884.csv
    Output files: v2.2/v2.2_train_Sa_i%d_%d.csv and the encoded
        result/cv_data/..._dev_/..._val_ CSVs.

    :param k: number of folds.
    :return: None — all output goes to disk.
    """
    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'  # L or Sa
    dataset_type = 'S'
    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0'
    }
    train_data, test_data = data_util.load_train_test_data(config)
    label_to_index, index_to_label = data_util.get_label_index(
        config['label_version'])
    # print(train_data.head())

    # pandas removed .as_matrix() in 1.0; .to_numpy() is the replacement.
    train_X = train_data['SENTENCE'].to_numpy()
    train_y = train_data['LABEL_INDEX'].to_numpy()
    test_X = test_data['SENTENCE'].to_numpy()
    test_y = test_data['LABEL_INDEX'].to_numpy()

    # Accumulate folds in memory and dump each raw fold to CSV.
    cv_x = []
    cv_y = []
    for index, (x, y) in enumerate(
            data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)):
        cv_x.append(x)
        cv_y.append(y)
        # print(y)
        y = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': y, 'SENTENCE': x})
        data_util.save_data(
            cv_data, 'result/cv_data/v2.3_train_%s_i%d_%d.csv' %
            (dataset_type, index, len(cv_data)))
        print(len(x))
    # quit()

    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y),
                                    (test_X, test_y), verbose=1)

    # Emit each encoded fold with labels prepended as the first column.
    counter = 0
    for dev_X, dev_y, val_X, val_y in all_cv_data:
        counter += 1
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' %
            (dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' %
            (dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )