示例#1
0
def train(model_name, category_type, dump=False):
    """Fit the pipeline for ``model_name``, print its test metrics, return it.

    The pipeline comes from ``tfidf_pipeline.make(model_name)`` and the label
    set from ``names.categories[category_type]``.  The 'full' data set is
    loaded for those categories, split into train and test parts, and the
    pipeline is fitted on the train part and evaluated on the test part.

    For the 'svr' and 'linreg' models the predictions are rounded and clipped
    to the range [0, 7] and scored with ``scorers.err1``; every other model is
    scored with ``scorers.err0`` and gets a classification report.  In both
    cases the confusion matrix is printed and plotted.

    Args:
        model_name: Preset name handed to ``tfidf_pipeline.make``.
        category_type: Key into ``names.categories``; also used in the plot
            call and in the name of the dumped file.
        dump: When true, the fitted pipeline is written with ``joblib.dump``
            to ``dumps/<category_type>_<model_name>_classifier.pkl``.

    Returns:
        The fitted pipeline object.
    """
    clf = tfidf_pipeline.make(model_name)
    categories = names.categories[category_type]

    print('Loading data...')
    dataset = data_loader.load('full', categories)
    # NOTE(review): 0.1 is presumably the held-out test fraction -- confirm
    # against data_loader.split.
    train_X, train_y, test_X, test_y = data_loader.split(dataset, 0.1)
    print('Done.')

    print('Training...')
    clf.fit(train_X, train_y)
    print('Done.')

    print('Testing...')
    predicted = clf.predict(test_X)

    is_regressor = model_name in ('svr', 'linreg')
    if not is_regressor:
        accuracy = scorers.err0(test_y, predicted)
        print('Exact accuracy: ' + str(accuracy))
        print(classification_report(test_y, predicted, target_names=categories))
    else:
        # Continuous outputs are snapped to whole labels within [0, 7]
        # before they are scored and fed to the confusion matrix.
        predicted = np.clip(np.round(predicted), 0, 7)
        accuracy = scorers.err1(test_y, predicted)
        print('Off-by-one accuracy: ' + str(accuracy))

    matrix = confusion_matrix(test_y, predicted)
    print(matrix)
    plot.plot_confusion_matrix(matrix, category_type)

    if not dump:
        return clf

    print('Saving classifier...')
    if not exists('dumps'):
        makedirs('dumps')
    pickle_name = category_type + '_' + model_name + '_classifier.pkl'
    joblib.dump(clf, join('dumps', pickle_name))
    print('Done.')

    return clf
import sys

from sklearn.externals.joblib import dump

import data_loader
import names
import tfidf_pipeline
import model_presets


if __name__ == '__main__':
    # Each (category set, model preset) pair produces one dumped classifier
    # named web_clf_<category set>.pkl in the current working directory.
    jobs = (('stars', 'linreg'), ('binary', 'svc'))

    for category_name, model_name in jobs:
        print('Loading ' + category_name + ' data')
        categories = names.categories[category_name]
        # The 'split' load returns a pair; only its first element is used
        # here, for fitting.
        train_set, _ = data_loader.load('split', categories)

        print('Training ' + model_name)
        pipeline = tfidf_pipeline.make(model_name)
        pipeline.fit(train_set.data, train_set.target)

        print('Dumping ' + model_name)
        dump(pipeline, 'web_clf_' + category_name + '.pkl')