예제 #1
0
    feature_encoder = FeatureEncoder(
                                     verbose=config['verbose'],
                                     need_segmented=True,
                                     full_mode=config['full_mode'],
                                     remove_stopword=config['remove_stopword'],
                                     replace_number=True,
                                     lowercase=True,
                                     zhs2zht=True,
                                     remove_url=True,
                                     feature_method=config['model'],
                                     max_features=config['max_features'],
                                     )
    train_X_feature = feature_encoder.fit_transform(train_data[u'SENTENCE'].as_matrix())
    test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix())
    print(train_X_feature)
    feature_encoder.print_model_descibe()
    keywords = feature_encoder.vocabulary
    print ','.join(keywords)
    print len(keywords)
else:
    feature_encoder = None
# Report the shape of the fitted training feature matrix to both the debug
# log and stdout, building the message once so the two outputs always match.
fit_shape_message = 'fit X shape is :%s' % (str(train_X_feature.shape))
logging.debug(fit_shape_message)
print(fit_shape_message)

# Show the first encoded training sample as a quick sanity check.
print(train_X_feature[0])

# ------------------------------------------------------------------------------
# -------------- region end : 将数据转为特征 -------------
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
예제 #2
0
    verbose=0,
    need_segmented=True,
    full_mode=True,
    remove_stopword=False,
    replace_number=True,
    lowercase=True,
    zhs2zht=True,
    remove_url=True,
    feature_method='bow',
    feature_type=feature_type,
    max_features=2000,
)

# Fit the encoder on the training sentences, then encode the test sentences
# with the already-fitted encoder.
# NOTE(review): assumes train_data / test_data are pandas DataFrames - confirm.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; `.values` is available on old and new versions.
train_X_feature = feature_encoder.fit_transform(train_data[u'SENTENCE'].values)
test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].values)
# Record whatever print_model_descibe() returns in the output file.
fout.write('%s\n'%feature_encoder.print_model_descibe())

# Dump the vocabulary size and the comma-joined vocabulary for inspection.
print(feature_encoder.vocabulary_size)
print(','.join(feature_encoder.vocabulary))

# Report the shapes of the encoded train / test feature matrices.
logging.debug('fit X shape is :%s'%(str(train_X_feature.shape)))
print('fit X shape is :%s' % (str(train_X_feature.shape)))
print('test X shape is :%s'%(str(test_X_feature.shape)))

# Pull the label-index columns out as arrays for the test and train splits.
test_y = test_data['LABEL_INDEX'].values
train_y = train_data['LABEL_INDEX'].values

# ------------------------------------------------------------------------------
# -------------- region end : 将数据转为特征 -------------
# ------------------------------------------------------------------------------