def get_feature_encoder(**kwargs):
    """Build the merged feature encoder for this classifier: a word2vec-style
    index encoder plus a bag-of-words (one-hot) encoder, wrapped together in a
    FeatureEncoderMerge.

    :param kwargs: supported keys --
        input_length (required): sentence padding length for both encoders.
        full_mode (optional, default False): segmentation full mode.
        feature_type (optional, default 'word'): feature granularity.
        verbose (optional, default 0): verbosity level.
        word2vec_to_solve_oov (optional, default False): use word2vec for OOV words.
        word2vec_model_file_path (optional, default None): path to the w2v model.
    :return: FeatureEncoderMerge(bow_feature_encoder=..., w2v_feature_encoder=...)
    :raises AssertionError: if 'input_length' is not provided.
    """
    # Fix: dict.has_key() is Python-2-only (removed in Python 3); the `in`
    # operator is equivalent on Python 2 and forward-compatible.
    assert 'input_length' in kwargs, '请提供 input_length 的属性值'
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    from data_processing_util.feature_encoder.feature_encoder_merge import FeatureEncoderMerge
    # Encoder for the word2vec channel: padded index sequences, center padding.
    w2v_feature_encoder = FeatureEncoder(
        sentence_padding_length=kwargs['input_length'],
        verbose=kwargs.get('verbose', 0),
        need_segmented=True,
        full_mode=kwargs.get('full_mode', False),
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        padding_mode='center',
        add_unkown_word=True,
        feature_type=kwargs.get('feature_type', 'word'),
        word2vec_to_solve_oov=kwargs.get('word2vec_to_solve_oov', False),
        word2vec_model_file_path=kwargs.get('word2vec_model_file_path', None))
    # Encoder for the bag-of-words channel: one-hot arrays, left padding.
    bow_feature_encoder = FeatureEncoder(
        sentence_padding_length=kwargs['input_length'],
        verbose=kwargs.get('verbose', 0),
        need_segmented=True,
        full_mode=kwargs.get('full_mode', False),
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type=kwargs.get('feature_type', 'word'),
        zhs2zht=True,
        remove_url=True,
        # set True so the encoder outputs a one-hot array
        to_onehot_array=True,
        word2vec_to_solve_oov=kwargs.get('word2vec_to_solve_oov', False),
        word2vec_model_file_path=kwargs.get('word2vec_model_file_path', None))
    return FeatureEncoderMerge(bow_feature_encoder=bow_feature_encoder,
                               w2v_feature_encoder=w2v_feature_encoder)
def get_feature_encoder(**kwargs):
    """Build the feature encoder for this classifier: a single one-hot/index
    FeatureEncoder with center padding.

    :param kwargs: supported keys --
        input_length (required): sentence padding length.
        full_mode (optional, default False): segmentation full mode.
        feature_type (optional, default 'word'): feature granularity.
        verbose (optional, default 0): verbosity level.
    :return: the configured FeatureEncoder instance.
    :raises AssertionError: if 'input_length' is not provided.
    """
    # Fix: dict.has_key() is Python-2-only (removed in Python 3); the `in`
    # operator is equivalent on Python 2 and forward-compatible.
    assert 'input_length' in kwargs, '请提供 input_length 的属性值'
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    feature_encoder = FeatureEncoder(
        sentence_padding_length=kwargs['input_length'],
        verbose=kwargs.get('verbose', 0),
        need_segmented=True,
        full_mode=kwargs.get('full_mode', False),
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        padding_mode='center',
        add_unkown_word=True,
        feature_type=kwargs.get('feature_type', 'word')
    )
    return feature_encoder
def get_feature_encoder(**kwargs):
    """Build the feature encoder for this classifier, with more knobs exposed
    through kwargs than the other variants (padding mode, dictionary updates,
    test-set vocabulary inclusion).

    :param kwargs: supported keys --
        input_length (required): sentence padding length.
        full_mode (optional, default False): segmentation full mode.
        feature_type (optional, default 'word'): feature granularity.
        verbose (optional, default 0): verbosity level.
        need_segmented (optional, default True): whether input needs segmenting.
        padding_mode (optional, default 'center'): padding placement.
        vocabulary_including_test_set (optional, default True).
        update_dictionary (optional, default True).
    :return: the configured FeatureEncoder instance.
    :raises AssertionError: if 'input_length' is not provided.
    """
    # Fix: dict.has_key() is Python-2-only (removed in Python 3); the `in`
    # operator is equivalent on Python 2 and forward-compatible.
    assert 'input_length' in kwargs, '请提供 input_length 的属性值'
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    feature_encoder = FeatureEncoder(
        need_segmented=kwargs.get('need_segmented', True),
        sentence_padding_length=kwargs['input_length'],
        verbose=kwargs.get('verbose', 0),
        full_mode=kwargs.get('full_mode', False),
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        padding_mode=kwargs.get('padding_mode', 'center'),
        add_unkown_word=True,
        feature_type=kwargs.get('feature_type', 'word'),
        vocabulary_including_test_set=kwargs.get(
            'vocabulary_including_test_set', True),
        update_dictionary=kwargs.get('update_dictionary', True))
    return feature_encoder
def get_feature_encoder(**kwargs):
    """Build the input feature encoder for this model: a bag-of-words one-hot
    FeatureEncoder with left padding, optionally dumping the kwargs when verbose.

    :param kwargs: supported keys --
        input_length (required): sentence padding length.
            (Docstring previously said 'sentence_padding_length', but the code
            requires the key 'input_length' -- corrected here.)
        full_mode (optional, default False): segmentation full mode.
        feature_type (optional, default 'word'): feature granularity.
        verbose (optional, default 0): verbosity level; > 0 pprints kwargs.
        word2vec_to_solve_oov (optional, default False): use word2vec for OOV words.
        word2vec_model_file_path (optional, default None): path to the w2v model.
    :return: the configured FeatureEncoder instance.
    :raises AssertionError: if 'input_length' is not provided.
    """
    # Fix: dict.has_key() is Python-2-only (removed in Python 3); the `in`
    # operator is equivalent on Python 2 and forward-compatible.
    assert 'input_length' in kwargs, '请提供 input_length 的属性值'
    # NOTE(review): FeatureEncoder and pprint are assumed to be imported at the
    # top of this file (not visible in this chunk) -- confirm.
    feature_encoder = FeatureEncoder(
        sentence_padding_length=kwargs['input_length'],
        verbose=kwargs.get('verbose', 0),
        need_segmented=True,
        full_mode=kwargs.get('full_mode', False),
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type=kwargs.get('feature_type', 'word'),
        zhs2zht=True,
        remove_url=True,
        # set True so the encoder outputs a one-hot array
        to_onehot_array=True,
        word2vec_to_solve_oov=kwargs.get('word2vec_to_solve_oov', False),
        word2vec_model_file_path=kwargs.get('word2vec_model_file_path', None))
    if kwargs.get('verbose', 0) > 0:
        pprint.pprint(kwargs)
    return feature_encoder
def get_feature_encoder(**kwargs):
    """Build the pair of feature encoders used by this classifier: one for
    word-level features and one for segment-level ('seg') features.

    :param kwargs: required keys: word_input_length, seg_input_length
        (sentence padding lengths for the two encoders).
    :return: (word_feature_encoder, seg_feature_encoder) tuple.
    """
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

    def _build_encoder(padding_length, feature_type):
        # Both encoders share the same configuration except for the padding
        # length and the feature granularity.
        return FeatureEncoder(
            sentence_padding_length=padding_length,
            verbose=0,
            need_segmented=True,
            full_mode=False,
            replace_number=True,
            remove_stopword=True,
            lowercase=True,
            padding_mode='left',
            add_unkown_word=True,
            feature_type=feature_type,
            zhs2zht=True,
            remove_url=True,
            # set True so the encoder outputs a one-hot array
            to_onehot_array=True,
        )

    word_feature_encoder = _build_encoder(kwargs['word_input_length'], 'word')
    seg_feature_encoder = _build_encoder(kwargs['seg_input_length'], 'seg')
    return word_feature_encoder, seg_feature_encoder
def test_onehot_bow_cnn():
    """Usage example: train a MultiChannelOnehotBowCNN on a few toy sentences,
    feeding it two one-hot channels (word-level and segment-level encodings).
    """
    # Toy data. NOTE(review): 'trian_y' is a typo for 'train_y', kept as-is.
    train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机']
    trian_y = [1, 3, 2, 2, 3]
    test_X = ['句子', '你好', '你妹']
    test_y = [2, 3, 0]
    sentence_padding_length = 8
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    # Word-level one-hot encoder (first input channel).
    word_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='word',
        zhs2zht=True,
        remove_url=True,
        # set True so the encoder outputs a one-hot array
        to_onehot_array=True,
    )
    train_X_word_feature = word_feature_encoder.fit_transform(train_X)
    test_X_word_feature = word_feature_encoder.transform(test_X)
    print(','.join(word_feature_encoder.vocabulary))
    print train_X_word_feature.shape
    print train_X_word_feature
    # Segment-level ('seg') one-hot encoder (second input channel).
    seg_feature_encoder = FeatureEncoder(
        sentence_padding_length=sentence_padding_length,
        verbose=0,
        need_segmented=True,
        full_mode=True,
        replace_number=True,
        remove_stopword=True,
        lowercase=True,
        padding_mode='left',
        add_unkown_word=True,
        feature_type='seg',
        zhs2zht=True,
        remove_url=True,
        # set True so the encoder outputs a one-hot array
        to_onehot_array=True,
    )
    train_X_seg_feature = seg_feature_encoder.fit_transform(train_X)
    test_X_seg_feature = seg_feature_encoder.transform(test_X)
    print(','.join(seg_feature_encoder.vocabulary))
    print train_X_seg_feature.shape
    print train_X_seg_feature
    # quit()
    # Two-channel CNN; each channel's input dim is its encoder vocabulary size.
    onehot_cnn = MultiChannelOnehotBowCNN(
        rand_seed=1377,
        verbose=1,
        feature_encoder=(word_feature_encoder,seg_feature_encoder),
        # optimizers='adadelta',
        optimizers='sgd',
        word_input_length=sentence_padding_length,
        seg_input_length=sentence_padding_length,
        word_input_dim=word_feature_encoder.vocabulary_size,
        seg_input_dim=seg_feature_encoder.vocabulary_size,
        num_labels=5,
        l1_conv_filter_type=[
            [1, 2, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, 3, -1, 'valid', (0, 1), 0., 'relu', 'none'],
            [1, -1, -1, 'bow', (0, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[
            # [16, 2, -1, 'valid',(2,1),0.5, 'relu', 'none']
        ],
        full_connected_layer_units=[
            (50, 0.5, 'relu', 'none'),
        ],
        embedding_dropout_rate=0.,
        nb_epoch=30,
        nb_batch=5,
        earlyStoping_patience=20,
        lr=1e-2,
    )
    onehot_cnn.print_model_descibe()
    # Train the model.
    # Load the model from a saved pickle:
    # onehot_cnn.model_from_pickle('model/modelA.pkl')
    print(onehot_cnn.fit(([train_X_word_feature,train_X_seg_feature], trian_y),
                         ([test_X_word_feature,test_X_seg_feature], test_y)))
    print(trian_y)
    # loss, train_accuracy = onehot_cnn.model.evaluate(train_X_feature, trian_y)
    # onehot_cnn.accuracy((train_X_word_feature, trian_y), transform_input=False)
    print(onehot_cnn.batch_predict([test_X_word_feature,test_X_seg_feature], transform_input=False))
    print(onehot_cnn.batch_predict_bestn([test_X_word_feature,test_X_seg_feature], transform_input=False, bestn=2))
    quit()
    # Unreachable after quit(): raw-text prediction examples.
    print onehot_cnn.batch_predict(test_X, transform_input=True)
    print onehot_cnn.predict(test_X[0], transform_input=True)
    onehot_cnn.accuracy((test_X, test_y), transform_input=True)
    # Save the model:
    # onehot_cnn.save_model('model/modelA.pkl')
    print onehot_cnn.predict('你好吗', transform_input=True)
def count_word_freq(self, data):
    """Count, for every vocabulary word, its frequency in each stance class.

    For each word five statistics are produced:
        1. FAVOR:   occurrences in the FAVOR class
        2. AGAINST: occurrences in the AGAINST class
        3. NONE:    occurrences in the NONE class
        4. FREQ:    occurrences over all classes, i.e. FAVOR+AGAINST+NONE
        5. SUPPORT: highest per-class frequency / FREQ

    Side effects: saves the one-hot matrix to 'result/train_X_feature.npy',
    the vocabulary to 'result/vocabulary.npy', and the statistics table to
    'result/word_count.csv'.

    :param data: presumably a DataFrame with 'WORDS' (pre-segmented text) and
        'STANCE' columns -- TODO confirm against caller.
    :return: None
    """
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    # Input is already segmented (need_segmented=False) and no padding is
    # applied (padding_mode='none'); the encoder is only used for its one-hot
    # vocabulary representation.
    feature_encoder = FeatureEncoder(
        train_data=data['WORDS'].as_matrix(),
        verbose=0,
        padding_mode='none',
        need_segmented=False,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        remove_url=True,
        sentence_padding_length=7,
        add_unkown_word=False,
        mask_zero=False,
        zhs2zht=True,
    )
    # print feature_encoder.train_padding_index
    train_X_features = feature_encoder.to_onehot_array()
    np.save('result/train_X_feature', train_X_features)
    print train_X_features.shape
    print train_X_features[:5]
    vocabulary = feature_encoder.vocabulary
    print ','.join(vocabulary)
    print feature_encoder.vocabulary_size
    np.save('result/vocabulary', vocabulary)
    # Column-wise sums over the one-hot matrix give per-word counts.
    freq = np.sum(train_X_features, axis=0)
    favor_freq = np.sum(
        train_X_features[data['STANCE'].as_matrix() == u'FAVOR'], axis=0)
    against_freq = np.sum(
        train_X_features[data['STANCE'].as_matrix() == u'AGAINST'], axis=0)
    none_freq = np.sum(
        train_X_features[data['STANCE'].as_matrix() == u'NONE'], axis=0)
    # SUPPORT = dominant class share; nan_to_num maps the 0/0 case (word with
    # zero total count) to 0.
    support = np.nan_to_num([
        max(favor, against, none) / (1.0 * (favor + against + none))
        for favor, against, none in zip(favor_freq, against_freq, none_freq)
    ])
    print freq
    print favor_freq
    print against_freq
    print none_freq
    count_data = pd.DataFrame(
        data={
            u'WORD': vocabulary,
            u'FAVOR': favor_freq,
            u'AGAINST': against_freq,
            u'NONE': none_freq,
            u'SUPPORT': support,
            u'FREQ': freq,
        })
    # Sort by support, then total frequency, then word (all descending).
    count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'],
                                        ascending=False)
    count_data = count_data[[
        u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT'
    ]]
    count_data.to_csv(
        'result/word_count.csv',
        sep='\t',
        index=False,
        header=True,
        encoding='utf8',
    )
    print count_data.head()
if __name__ == '__main__': # 使用样例 train_X = ['你好', '无聊', '测试句子', '今天天气不错', '我要买手机'] trian_y = [1, 3, 2, 2, 3] test_X = ['你好', '你好', '你妹'] test_y = [3, 1, 1] sentence_padding_length = 10 feature_encoder = FeatureEncoder( sentence_padding_length=sentence_padding_length, verbose=0, need_segmented=True, full_mode=True, remove_stopword=True, replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, padding_mode='center', add_unkown_word=True, mask_zero=True) train_X_features = feature_encoder.fit_transform(train_data=train_X) print(train_X_features) dcnn = DynamicCNN( rand_seed=1337, verbose=2, batch_size=1, vocab_size=feature_encoder.vocabulary_size, word_embedding_dim=48, # input_length=None,
# -------------- code start : 开始 ------------- # 将TARGET和TEXT字段进行拼接 train_X = (train_data['TARGET'] + ',' + train_data['TEXT']).as_matrix() test_X = (test_data['TARGET'] + ',' + test_data['TEXT']).as_matrix() train_y = train_data['STANCE'].map(label_to_index).as_matrix() test_y = test_data['STANCE'].map(label_to_index).as_matrix() feature_encoder = FeatureEncoder(train_data=train_X, sentence_padding_length=config['sentence_padding_length'], verbose=0, need_segmented=config['need_segmented'], full_mode=True, replace_number=True, remove_stopword=True, lowercase=True, padding_mode='center', add_unkown_word=True, mask_zero=True, zhs2zht=True, remove_url=True, ) train_X_feature = feature_encoder.train_padding_index test_X_feature = map(feature_encoder.transform_sentence, test_X) feature_encoder.print_sentence_length_detail print feature_encoder.vocabulary_size # print ','.join(sorted(feature_encoder.vocabulary)) # quit() feature_encoder.print_model_descibe()
def count_word_freq():
    """Count word frequencies for the file train_data/TaskAA_all_data_2986.csv.

    For each word five statistics are produced:
        1. FAVOR:   occurrences in the FAVOR class
        2. AGAINST: occurrences in the AGAINST class
        3. NONE:    occurrences in the NONE class
        4. FREQ:    occurrences over all classes, i.e. FAVOR+AGAINST+NONE
        5. SUPPORT: highest per-class frequency / FREQ

    Steps:
        1. encode all sentences as one-hot vectors (and save them)
        2. compute the five statistics for every word

    Relies on module-level names `verbose`, `train_dataA`, `np`, `pd`,
    `logging` -- assumed defined elsewhere in this file.

    :return: None
    """
    # -------------- region start : 1. one-hot encode all sentences and save -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('1. 将所有句子转成onehot编码,并保存数据')
        print('1. 将所有句子转成onehot编码,并保存数据')
    # -------------- code start : begin -------------
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
    # print train_dataA.head()
    print(train_dataA.shape)
    # Input is already segmented (need_segmented=False) and no padding is
    # applied (padding_mode='none'); the encoder is only used for its one-hot
    # vocabulary representation.
    feature_encoder = FeatureEncoder(
        train_data=train_dataA['WORDS'].as_matrix(),
        verbose=0,
        padding_mode='none',
        need_segmented=False,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        remove_url=True,
        sentence_padding_length=7,
        add_unkown_word=False,
        mask_zero=False,
        zhs2zht=True,
    )
    # print feature_encoder.train_padding_index
    train_X_features = feature_encoder.to_onehot_array()
    np.save('result/train_X_feature', train_X_features)
    print(train_X_features.shape)
    print(train_X_features[:5])
    vocabulary = feature_encoder.vocabulary
    print(','.join(vocabulary))
    print('字典个数有:%d' % feature_encoder.vocabulary_size)
    np.save('result/vocabulary', vocabulary)
    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
    # -------------- region end : 1. one-hot encode all sentences and save ---------------
    # -------------- region start : 2. compute the five statistics per word -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('2. 统计每个词的5种统计值')
        print('2. 统计每个词的5种统计值')
    # -------------- code start : begin -------------
    # Total frequency per word (column-wise sums over the one-hot matrix).
    freq = np.sum(train_X_features, axis=0)
    # Frequency within the FAVOR class.
    favor_freq = np.sum(
        train_X_features[train_dataA['STANCE'].as_matrix() == u'FAVOR'],
        axis=0)
    # Frequency within the AGAINST class.
    against_freq = np.sum(
        train_X_features[train_dataA['STANCE'].as_matrix() == u'AGAINST'],
        axis=0)
    # Frequency within the NONE class.
    none_freq = np.sum(
        train_X_features[train_dataA['STANCE'].as_matrix() == u'NONE'],
        axis=0)
    # SUPPORT = dominant class frequency / total frequency; nan_to_num maps
    # the 0/0 case (word with zero total count) to 0.
    support = np.nan_to_num([
        max(favor, against, none) / (1.0 * (favor + against + none))
        for favor, against, none in zip(favor_freq, against_freq, none_freq)
    ])
    print(freq)
    print(favor_freq)
    print(against_freq)
    print(none_freq)
    count_data = pd.DataFrame(
        data={
            u'WORD': vocabulary,
            u'FAVOR': favor_freq,
            u'AGAINST': against_freq,
            u'NONE': none_freq,
            u'SUPPORT': support,
            u'FREQ': freq,
        })
    # Sort by support, then total frequency, then word (all descending).
    count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'],
                                        ascending=False)
    count_data = count_data[[
        u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT'
    ]]
    # Save the statistics table.
    count_data.to_csv(
        'result/word_count_%d.csv' % feature_encoder.vocabulary_size,
        sep='\t',
        index=False,
        header=True,
        encoding='utf8',
    )
    print(count_data.head())
    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)