# Echo the start-of-run banner on stdout and mirror it in the debug log.
print('=' * 30)
# Parenthesised call: same output under Python 2, and valid under Python 3
# (the bare ``print 'x'`` statement is a syntax error there).
print('start running!')
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ****************************************************************
# +++++++++++++ region start : 1. Load training and test data +++++++++++++
# ****************************************************************
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# ****************************************************************
# ------------- region end : 1. Load training and test data -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. Convert the data format and encode features +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. 转换数据的格式并特征编码')
# Last lines of the parameter echo: the tree setting, then a separator.
print('树:%s' % estimators)
print('=' * 30)
logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : Parameter settings -------------
# ****************************************************************

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index(version=config['label_version'])
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()

# ``.values`` replaces ``Series.as_matrix()``, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the underlying ndarray.
test_y = test_data['LABEL_INDEX'].values
train_y = train_data['LABEL_INDEX'].values
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# -------------- region start : Convert the data into features -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('将数据转为特征')
# Write the run banner and the configured description to the debug log.
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

import os

# ****************************************************************
# +++++++++++++ region start : 1. Load training and test data +++++++++++++
# ****************************************************************
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
# ``config`` is defined earlier in the script, outside this fragment.
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index()
# ****************************************************************
# ------------- region end : 1. Load training and test data -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. Convert the data format and encode features +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. 转换数据的格式并特征编码')
# Echo the run description and start banner on stdout, then in the debug log.
# Every print is a parenthesised single-argument call so the fragment behaves
# the same on Python 2 and also parses on Python 3.
print(config['describe'])
print('=' * 30)
print('start running!')
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# -------------- region start : Convert the data into features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')
logging.debug('使用 %s 提取特征向量' % (config['model']))
# Last lines of the parameter echo printed to stdout, then a log separator.
print('l2_conv_filter_type:%s'%l2_conv_filter_type)
print('k:%s'%k)
print('=' * 150)
logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : Parameter settings -------------
# ****************************************************************

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
# ``config`` is defined earlier in the script, outside this fragment.
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# -------------- region start : Convert the data into features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')
logging.debug('使用 %s 提取特征向量' % (config['model']))
logging.basicConfig(filename=''.join(config['log_file_path']), filemode='w', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) start_time = timeit.default_timer() print('=' * 30) print config['describe'] print('=' * 30) print 'start running!' logging.debug('=' * 30) logging.debug(config['describe']) logging.debug('=' * 30) logging.debug('start running!') logging.debug('=' * 20) from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil data_util = DataUtil() train_data, test_data = data_util.load_train_test_data(config) label_to_index, index_to_label = data_util.get_label_index() train_x = train_data['SENTENCE'].as_matrix() train_y = train_data['LABEL_INDEX'].as_matrix() test_x = test_data['SENTENCE'].as_matrix() test_y = test_data['LABEL_INDEX'].as_matrix() from deep_learning.cnn.wordEmbedding_cnn.example.one_conv_layer_wordEmbedding_cnn import WordEmbeddingCNNWithOneConv input_length = 14 word_embedding_dim = 50 WordEmbeddingCNNWithOneConv.cross_validation( train_data=(train_x, train_y), test_data=(test_x, test_y),
# Echo the run description and start banner on stdout, then in the debug log.
# Every print is a parenthesised single-argument call so the fragment behaves
# the same on Python 2 and also parses on Python 3.
print(config['describe'])
print('=' * 30)
print('start running!')
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# -------------- region start : Convert the data into features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')
logging.debug('使用 %s 提取特征向量' % (config['model']))
) np.savetxt( 'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv'%(dataset_type, feature_type, counter, len(val)), val, fmt='%d', delimiter=',', ) if __name__ == '__main__': dutil = DataUtil() # process_train_data_Sa() # process_train_data_S() # process_train_data_L() # process_test_data() # # genernate_aiml_file() # process_train_data_sentence_length() # process_train_data_for_k_fold(3) # quit() dutil.print_data_detail(dutil.load_data('data/v2.3_train_Sa_891.csv')) # dutil.print_data_detail(dutil.load_data('data/v2.3_train_Sa_891.csv')) quit()
def process_train_data_for_k_fold(k=3):
    """Split the training data into ``k`` folds and dump cross-validation files.

    The v2.3 train/test data for ``dataset_type`` is loaded through
    ``DataUtil``. The training sentences are split with
    ``data_split_k_fold`` using a fixed ``rand_seed=3`` (the original note
    says the split should follow the class distribution as far as possible;
    that is the helper's responsibility). Each fold is saved as a
    LABEL/SENTENCE CSV. All folds are then encoded with a bag-of-words
    ``FeatureEncoder`` and every (dev, val) pair yielded by
    ``transform_cv_data`` is written with ``np.savetxt``, with the label
    in column 0 followed by the feature columns.

    Files written under ``result/cv_data/``:
        v2.3_train_<dataset_type>_i<fold>_<rows>.csv
        v2.3_train_<dataset_type>_<feature_type>_i<n>_dev_<rows>.csv
        v2.3_train_<dataset_type>_<feature_type>_i<n>_val_<rows>.csv

    ``<fold>`` is 0-based, ``<n>`` is 1-based.

    :param k: number of folds to split the training data into.
    :return: None
    """
    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'
    # Dataset variant; the original note lists 'L' or 'Sa' as alternatives.
    dataset_type = 'S'

    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0',
    }
    train_data, test_data = data_util.load_train_test_data(config)
    # Only the index -> label mapping is needed below.
    _, index_to_label = data_util.get_label_index(config['label_version'])

    # ``.values`` replaces ``Series.as_matrix()``, removed in pandas 1.0.
    train_X = train_data['SENTENCE'].values
    train_y = train_data['LABEL_INDEX'].values
    test_X = test_data['SENTENCE'].values
    test_y = test_data['LABEL_INDEX'].values

    cv_x = []
    cv_y = []
    folds = data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)
    for index, (x, y) in enumerate(folds):
        cv_x.append(x)
        cv_y.append(y)
        # Save the fold with readable label names instead of label indices.
        labels = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': labels, 'SENTENCE': x})
        data_util.save_data(
            cv_data,
            'result/cv_data/v2.3_train_%s_i%d_%d.csv' % (dataset_type, index, len(cv_data)),
        )
        print(len(x))

    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y), (test_X, test_y), verbose=1)

    # File names number the (dev, val) pairs from 1.
    for counter, (dev_X, dev_y, val_X, val_y) in enumerate(all_cv_data, start=1):
        # Prepend the label as the first column of each feature matrix.
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' % (
                dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' % (
                dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )
(dataset_type, feature_type, counter, len(dev)), dev, fmt='%d', delimiter=',', ) np.savetxt( 'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' % (dataset_type, feature_type, counter, len(val)), val, fmt='%d', delimiter=',', ) if __name__ == '__main__': dutil = DataUtil() # process_train_data_Sa() # process_train_data_S() # process_train_data_L() # process_test_data() # # genernate_aiml_file() # process_train_data_sentence_length() # process_train_data_for_k_fold(3) # quit() dutil.print_data_detail(dutil.load_data('data/v2.3_train_Sa_891.csv')) # dutil.print_data_detail(dutil.load_data('data/v2.3_train_Sa_891.csv')) quit()
def process_train_data_for_k_fold(k=3):
    """Split the training data into ``k`` folds and dump cross-validation files.

    The v2.3 train/test data for ``dataset_type`` is loaded through
    ``DataUtil``. The training sentences are split with
    ``data_split_k_fold`` using a fixed ``rand_seed=3`` (the original note
    says the split should follow the class distribution as far as possible;
    that is the helper's responsibility). Each fold is saved as a
    LABEL/SENTENCE CSV. All folds are then encoded with a bag-of-words
    ``FeatureEncoder`` and every (dev, val) pair yielded by
    ``transform_cv_data`` is written with ``np.savetxt``, with the label
    in column 0 followed by the feature columns.

    Files written under ``result/cv_data/``:
        v2.3_train_<dataset_type>_i<fold>_<rows>.csv
        v2.3_train_<dataset_type>_<feature_type>_i<n>_dev_<rows>.csv
        v2.3_train_<dataset_type>_<feature_type>_i<n>_val_<rows>.csv

    ``<fold>`` is 0-based, ``<n>`` is 1-based.

    :param k: number of folds to split the training data into.
    :return: None
    """
    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'
    # Dataset variant; the original note lists 'L' or 'Sa' as alternatives.
    dataset_type = 'S'

    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0'
    }
    train_data, test_data = data_util.load_train_test_data(config)
    # Only the index -> label mapping is needed below.
    _, index_to_label = data_util.get_label_index(
        config['label_version'])

    # ``.values`` replaces ``Series.as_matrix()``, removed in pandas 1.0.
    train_X = train_data['SENTENCE'].values
    train_y = train_data['LABEL_INDEX'].values
    test_X = test_data['SENTENCE'].values
    test_y = test_data['LABEL_INDEX'].values

    cv_x = []
    cv_y = []
    for index, (x, y) in enumerate(
            data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)):
        cv_x.append(x)
        cv_y.append(y)
        # Save the fold with readable label names instead of label indices.
        label_names = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': label_names, 'SENTENCE': x})
        data_util.save_data(
            cv_data, 'result/cv_data/v2.3_train_%s_i%d_%d.csv' %
            (dataset_type, index, len(cv_data)))
        print(len(x))

    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y),
                                    (test_X, test_y), verbose=1)

    # File names number the (dev, val) pairs from 1.
    for counter, (dev_X, dev_y, val_X, val_y) in enumerate(all_cv_data, 1):
        # Prepend the label as the first column of each feature matrix.
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' %
            (dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' %
            (dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )
# Last lines of the parameter dump written to ``fout`` (a file-like object
# opened outside this fragment), then a log separator.
fout.write('l2_conv_filter_type:%s\n'%l2_conv_filter_type)
fout.write('k:%s\n'%k)
fout.write('=' * 150+'\n')
logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : Parameter settings -------------
# ****************************************************************

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
# ``config`` is defined earlier in the script, outside this fragment.
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# -------------- region start : Convert the data into features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')
logging.debug('使用 %s 提取特征向量'%(config['model']))
# Tail of the start-of-run banner in the debug log.
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

from data_processing_util.jiebanlp.jieba_util import Jieba_Util
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from gensim.models import Word2Vec

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index(version=config['label_version'])
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()

# ``.values`` replaces ``Series.as_matrix()``, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the underlying ndarray.
test_y = test_data['LABEL_INDEX'].values
train_y = train_data['LABEL_INDEX'].values
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# Keep only the label and sentence columns; the label indices were already
# extracted into train_y / test_y above.
train_data = train_data[['LABEL','SENTENCE']]
test_data = test_data[['LABEL','SENTENCE']]
# Last lines of the parameter echo printed to stdout, then a log separator.
print('l2_conv_filter_type:%s' % l2_conv_filter_type)
print('k:%s' % k)
print('=' * 150)
logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : Parameter settings -------------
# ****************************************************************

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
# ``config`` is defined earlier in the script, outside this fragment.
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# -------------- region start : Convert the data into features ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')
# The model-name log line below is disabled in this script.
# logging.debug('使用 %s 提取特征向量' % (config['model']))
# Narrow the loaded configuration to its 'main' section.
config = config['main']
# Log at DEBUG level to the file named by config['log_file_path'] (its pieces
# are joined into one string); filemode='w' overwrites the file on each run.
logging.basicConfig(filename=''.join(config['log_file_path']), filemode='w',
                    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
# Timer reading taken at start; presumably used later to report elapsed
# time -- the use is outside this fragment.
start_time = timeit.default_timer()
print('=' * 30)
print(config['describe'])
print('=' * 30)
print('start running!')
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
# ****************************************************************
# +++++++++++++ region start : Parameter tuning +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('参数调节')
print('='*50)
batch_size = 100
feature_type = 'word'
full_mode = False
sentence_padding_length = 15
word_embedding_dim = 50
# The word2vec model name embeds the embedding dimension chosen above.
word2vec_model_file_path = data_util.transform_word2vec_model_name('%dd_v2.3Sa_word' % word_embedding_dim)
# Alternative model name, currently disabled:
# word2vec_model_file_path = data_util.transform_word2vec_model_name('%dd_weibo_100w' % word_embedding_dim)
# Write the run banner and the configured description to the debug log.
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

data_util = DataUtil()
# ``config`` is defined earlier in the script, outside this fragment.
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# -------------- region start : Convert the data into features -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('将数据转为特征')
# encoding=utf8 """ Author: 'jdwang' Date: 'create date: 2016-09-27'; 'last updated date: 2016-09-27' Email: '*****@*****.**' Describe: IALP paper - Dialogue Act Recognition for Chinese Out-of-Domain Utterances using Hybrid CNN-RF RF(BOC/BOW) 模型 """ from __future__ import print_function from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil data_util = DataUtil() # **************************************************************** # +++++++++++++ region start : 参数设置 +++++++++++++ # **************************************************************** print('=' * 30) config = { 'dataset_type': 'v2.3(Sa)', 'label_version': 'v2.0', 'verbose': 1, } word2vec_to_solve_oov = False feature_type = 'seg' seed = 64003 estimator_paramter_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000] # estimator_paramter_list = [2000] print('word2vec_to_solve_oov:%s\nrand_seed:%s\nfeature_type:%s' % (word2vec_to_solve_oov, seed, feature_type))
from jiebanlp.toolSet import seg
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): ``sklearn.grid_search`` was removed in scikit-learn 0.20;
# newer releases expose GridSearchCV from ``sklearn.model_selection``.
# Confirm the pinned scikit-learn version before changing this import.
from sklearn.grid_search import GridSearchCV
from gensim.models import Word2Vec
import pickle

# ------------------------------------------------------------------------------
# -------------- region start : Load training and test data -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
# Here load_train_test_data is unpacked into four values: the two data sets
# plus both label mappings.
train_data,test_data,label_to_index,index_to_label = data_util.load_train_test_data(config)
# ------------------------------------------------------------------------------
# -------------- region end : Load training and test data -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# -------------- region start : Convert the data into features -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('将数据转为特征')
logging.debug('=' * 20)
logging.debug('对数据进行分词...')
logging.debug('-' * 20)
# Last of the parameter settings, followed by an echo of them on stdout.
# word_input_length and seg_input_length are defined earlier in the script,
# outside this fragment.
layer1,hidden1 = 5,500
verbose = 1
print('word_input_length:%d\nseg_input_length:%d'%(word_input_length,seg_input_length))
print('layer1:%d\nhidden1:%d'%(layer1,hidden1))
print('=' * 30)

# ****************************************************************
# +++++++++++++ region start : 1. Load training and test data +++++++++++++
# ****************************************************************
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index()
# ****************************************************************
# ------------- region end : 1. Load training and test data -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. Convert the data format and encode features +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. 转换数据的格式并特征编码')
from deep_learning.cnn.wordEmbedding_cnn.multichannel_onehot_cnn_model import MultiChannelOnehotBowCNN
# Get this classifier's feature encoder
# Echo the start-of-run banner on stdout and mirror it in the debug log.
print('=' * 30)
# Parenthesised call: same output under Python 2, and valid under Python 3
# (the bare ``print 'x'`` statement is a syntax error there).
print('start running!')
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ****************************************************************
# +++++++++++++ region start : 1. Load training and test data +++++++++++++
# ****************************************************************
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# ****************************************************************
# ------------- region end : 1. Load training and test data -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. Convert the data format and encode features +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. 转换数据的格式并特征编码')