def count_word_freq(self, data):
    '''
    Count how often each word occurs in each stance class. Each word gets five statistics:
        1. FAVOR:   occurrences in the FAVOR class
        2. AGAINST: occurrences in the AGAINST class
        3. NONE:    occurrences in the NONE class
        4. FREQ:    occurrences across all classes, i.e. FAVOR + AGAINST + NONE
        5. SUPPORT: highest per-class frequency / FREQ

    :param data: dataframe with WORDS and STANCE columns
    :return:
    '''
    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

    feature_encoder = FeatureEncoder(
        train_data=data['WORDS'].as_matrix(),
        verbose=0,
        padding_mode='none',
        need_segmented=False,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        remove_url=True,
        sentence_padding_length=7,
        add_unkown_word=False,
        mask_zero=False,
        zhs2zht=True,
    )
    # print(feature_encoder.train_padding_index)
    train_X_features = feature_encoder.to_onehot_array()

    np.save('result/train_X_feature', train_X_features)

    print(train_X_features.shape)
    print(train_X_features[:5])

    vocabulary = feature_encoder.vocabulary
    print(','.join(vocabulary))
    print(feature_encoder.vocabulary_size)
    np.save('result/vocabulary', vocabulary)

    # total frequency and per-class frequencies
    freq = np.sum(train_X_features, axis=0)
    favor_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'FAVOR'], axis=0)
    against_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'AGAINST'], axis=0)
    none_freq = np.sum(train_X_features[data['STANCE'].as_matrix() == u'NONE'], axis=0)
    # support: dominant-class frequency / total frequency; nan_to_num guards the 0/0 case
    support = np.nan_to_num([max(favor, against, none) / (1.0 * (favor + against + none))
                             for favor, against, none in zip(favor_freq, against_freq, none_freq)])
    print(freq)
    print(favor_freq)
    print(against_freq)
    print(none_freq)

    count_data = pd.DataFrame(data={
        u'WORD': vocabulary,
        u'FAVOR': favor_freq,
        u'AGAINST': against_freq,
        u'NONE': none_freq,
        u'SUPPORT': support,
        u'FREQ': freq,
    })
    count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', u'WORD'], ascending=False)
    count_data = count_data[[u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT']]
    count_data.to_csv('result/word_count.csv',
                      sep='\t',
                      index=False,
                      header=True,
                      encoding='utf8',
                      )
    print(count_data.head())
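# A minimal, self-contained sketch of the counting logic above, using plain
# numpy/pandas in place of the project's FeatureEncoder. The toy one-hot
# matrix, stance labels, and vocabulary below are made up for illustration.
import numpy as np
import pandas as pd

def _count_word_freq_sketch():
    vocabulary = [u'good', u'bad', u'meh']
    # rows = sentences, columns = vocabulary; 1 means the word occurs
    train_X_features = np.array([[1, 0, 0],   # FAVOR
                                 [1, 1, 0],   # FAVOR
                                 [0, 1, 0],   # AGAINST
                                 [0, 0, 1]])  # NONE
    stance = np.array([u'FAVOR', u'FAVOR', u'AGAINST', u'NONE'])

    freq = train_X_features.sum(axis=0)
    favor_freq = train_X_features[stance == u'FAVOR'].sum(axis=0)
    against_freq = train_X_features[stance == u'AGAINST'].sum(axis=0)
    none_freq = train_X_features[stance == u'NONE'].sum(axis=0)
    # SUPPORT = dominant-class count / total count; nan_to_num turns the
    # 0/0 case (a word that never occurs) into 0 instead of NaN
    support = np.nan_to_num(
        np.max([favor_freq, against_freq, none_freq], axis=0) / (1.0 * freq))

    count_data = pd.DataFrame({u'WORD': vocabulary,
                               u'FAVOR': favor_freq,
                               u'AGAINST': against_freq,
                               u'NONE': none_freq,
                               u'FREQ': freq,
                               u'SUPPORT': support})
    print(count_data.sort_values(by=[u'SUPPORT', u'FREQ', u'WORD'],
                                 ascending=False))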
def count_word_freq():
    '''
    Count word frequencies over the file train_data/TaskAA_all_data_2986.csv.

    Count how often each word occurs in each stance class. Each word gets five statistics:
        1. FAVOR:   occurrences in the FAVOR class
        2. AGAINST: occurrences in the AGAINST class
        3. NONE:    occurrences in the NONE class
        4. FREQ:    occurrences across all classes, i.e. FAVOR + AGAINST + NONE
        5. SUPPORT: highest per-class frequency / FREQ

    Steps:
        1. Convert all sentences to one-hot encoding
        2. Compute the 5 statistics for each word

    :return:
    '''
    # -------------- region start : 1. Convert all sentences to one-hot encoding and save the data -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('1. Convert all sentences to one-hot encoding and save the data')
        print('1. Convert all sentences to one-hot encoding and save the data')
    # -------------- code start : begin -------------

    from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

    # print(train_dataA.head())
    print(train_dataA.shape)
    feature_encoder = FeatureEncoder(
        train_data=train_dataA['WORDS'].as_matrix(),
        verbose=0,
        padding_mode='none',
        need_segmented=False,
        full_mode=True,
        remove_stopword=True,
        replace_number=True,
        lowercase=True,
        remove_url=True,
        sentence_padding_length=7,
        add_unkown_word=False,
        mask_zero=False,
        zhs2zht=True,
    )
    # print(feature_encoder.train_padding_index)
    train_X_features = feature_encoder.to_onehot_array()

    np.save('result/train_X_feature', train_X_features)

    print(train_X_features.shape)
    print(train_X_features[:5])

    vocabulary = feature_encoder.vocabulary
    print(','.join(vocabulary))
    print('vocabulary size: %d' % feature_encoder.vocabulary_size)
    np.save('result/vocabulary', vocabulary)

    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
    # -------------- region end : 1. Convert all sentences to one-hot encoding and save the data ---------------

    # -------------- region start : 2. Compute the 5 statistics for each word -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('2. Compute the 5 statistics for each word')
        print('2. Compute the 5 statistics for each word')
    # -------------- code start : begin -------------

    # total frequency
    freq = np.sum(train_X_features, axis=0)
    # frequency within the FAVOR class
    favor_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'FAVOR'], axis=0)
    # frequency within the AGAINST class
    against_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'AGAINST'], axis=0)
    # frequency within the NONE class
    none_freq = np.sum(train_X_features[train_dataA['STANCE'].as_matrix() == u'NONE'], axis=0)
    # support: highest per-class frequency / total frequency
    support = np.nan_to_num([max(favor, against, none) / (1.0 * (favor + against + none))
                             for favor, against, none in zip(favor_freq, against_freq, none_freq)])
    print(freq)
    print(favor_freq)
    print(against_freq)
    print(none_freq)

    count_data = pd.DataFrame(data={
        u'WORD': vocabulary,
        u'FAVOR': favor_freq,
        u'AGAINST': against_freq,
        u'NONE': none_freq,
        u'SUPPORT': support,
        u'FREQ': freq,
    })
    # sort
    count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', u'WORD'], ascending=False)
    count_data = count_data[[u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT']]
    # save
    count_data.to_csv('result/word_count_%d.csv' % feature_encoder.vocabulary_size,
                      sep='\t',
                      index=False,
                      header=True,
                      encoding='utf8',
                      )
    print(count_data.head())

    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
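# The FeatureEncoder used above comes from this repo's data_processing_util
# package. As a rough, hypothetical stand-in for step 1, scikit-learn's
# CountVectorizer can build the same kind of binary one-hot matrix from the
# already-segmented (space-separated) WORDS column; options such as full_mode
# and zhs2zht have no direct sklearn equivalent and are omitted here.
from sklearn.feature_extraction.text import CountVectorizer

def _onehot_sketch(segmented_sentences):
    # binary=True records presence/absence rather than raw counts, matching
    # to_onehot_array(); the token pattern keeps any whitespace-separated token
    vectorizer = CountVectorizer(binary=True, token_pattern=r'(?u)\S+')
    train_X_features = vectorizer.fit_transform(segmented_sentences).toarray()
    vocabulary = vectorizer.get_feature_names_out()  # sklearn >= 1.0
    return train_X_features, vocabulary

# example with three segmented "sentences"
X, vocab = _onehot_sketch([u'deep learning is fun', u'learning is hard', u'so fun'])
print(X.shape)  # (3, vocabulary size)
print(u','.join(vocab))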
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        padding_mode='center',
        add_unkown_word=True,
        mask_zero=True,
    )
    train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())
    feature_encoder.print_model_descibe()
    feature_encoder.to_onehot_array()
    quit()  # debug: stop here; the code below is never reached
    train_y = train_data['LABEL_INDEX'].as_matrix()
    test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix())
    test_all_y = test_data['LABEL_INDEX'].as_matrix()
    print(train_X_feature.shape)
    print(test_all_X_feature.shape)
    logging.debug('=' * 20)
    # ****************************************************************
    # ------------- region end : 2. Convert data format and encode features -------------
    # ****************************************************************
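# The fragment above follows the usual fit-on-train / transform-on-test
# pattern: fit_transform builds the vocabulary from training sentences only,
# and transform maps test sentences through that fixed vocabulary. A minimal
# sketch of the same pattern with scikit-learn (the sentences are placeholders):
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(binary=True)
train_X = vec.fit_transform(['an example train sentence', 'another one'])  # learns the vocabulary
test_X = vec.transform(['an unseen test sentence'])  # reuses it; unknown words are dropped
print(train_X.shape, test_X.shape)  # same number of columns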