# Build a FeatureEncoder from `config`, fit it on train_data[u'SENTENCE'] and
# apply it to test_data[u'SENTENCE']; then print the encoded training features,
# call the encoder's print_model_descibe(), and print the vocabulary
# (comma-joined) followed by its length.
# The `else:` branch sets feature_encoder to None. The remaining statements log
# and print the shape of train_X_feature and print its first row.
# NOTE(review): the `if` that this `else:` pairs with is not visible here, and
# all statements share one physical line -- confirm the intended line breaks
# and indentation before editing.
# NOTE(review): `print ','.join(keywords)` and `print len(keywords)` are
# Python 2 print statements, while the other prints use call syntax.
# NOTE(review): presumably train_data/test_data are pandas DataFrames; if so,
# `as_matrix()` was deprecated in pandas 0.23 and removed in 1.0 (`.values` is
# the replacement) -- confirm against the pandas version in use.
feature_encoder = FeatureEncoder( verbose=config['verbose'], need_segmented=True, full_mode=config['full_mode'], remove_stopword=config['remove_stopword'], replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, feature_method=config['model'], max_features=config['max_features'], ) train_X_feature = feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix()) test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix()) print(train_X_feature) feature_encoder.print_model_descibe() keywords = feature_encoder.vocabulary print ','.join(keywords) print len(keywords) else: feature_encoder = None logging.debug('fit X shape is :%s'%(str(train_X_feature.shape))) print('fit X shape is :%s'%(str(train_X_feature.shape))) print(train_X_feature[0]) # ------------------------------------------------------------------------------ # -------------- region end : convert the data into features ------------- # ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Keyword arguments closing a call that was opened before this chunk
# (presumably the FeatureEncoder constructor assigned to feature_encoder --
# confirm). Here the settings are literals ('bow', 2000 max features, ...)
# plus the externally defined `feature_type`.
# After the call: fit the encoder on train_data[u'SENTENCE'], transform
# test_data[u'SENTENCE'], write the result of print_model_descibe() to `fout`,
# then print the vocabulary size and the comma-joined vocabulary.
# NOTE(review): everything after the first `#` on the line below (the
# logging.debug call, the two shape prints and the test_y/train_y assignments
# from the 'LABEL_INDEX' column) is comment text as written; presumably these
# were separate statements before the line breaks were lost -- verify.
# NOTE(review): presumably train_data/test_data are pandas DataFrames; if so,
# `as_matrix()` was deprecated in pandas 0.23 and removed in 1.0 (`.values` is
# the replacement) -- confirm against the pandas version in use.
verbose=0, need_segmented=True, full_mode=True, remove_stopword=False, replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, feature_method='bow', feature_type=feature_type, max_features=2000, ) train_X_feature = feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix()) test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix()) fout.write('%s\n'%feature_encoder.print_model_descibe()) print(feature_encoder.vocabulary_size) print(','.join(feature_encoder.vocabulary)) # print(train_X_feature) # print(test_X_feature) logging.debug('fit X shape is :%s'%(str(train_X_feature.shape))) print('fit X shape is :%s' % (str(train_X_feature.shape))) print('test X shape is :%s'%(str(test_X_feature.shape))) test_y = test_data['LABEL_INDEX'].as_matrix() train_y = train_data['LABEL_INDEX'].as_matrix() # ------------------------------------------------------------------------------ # -------------- region end : convert the data into features ------------- # ------------------------------------------------------------------------------