from pyutils.ensemble_selection.Ensemble import Ensemble from scipy.spatial.distance import hamming print print 'Reading data...' Id_test, X_test = extract_testing_data(args.kaggle_root + '/data/kaggle_test_tf_idf.csv') print print 'Loading ensemble...' ensemble = joblib.load(args.ensemble_dir + '/ensemble.pkl') if args.hill_predict: print print 'Ensemble hill predicting...' Id_train, X_train, y_train = extract_training_data( args.kaggle_root + '/data/kaggle_train_tf_idf.csv') err = hamming(ensemble.hill_predict(X_train), y_train) print '\tHill climbing error: {0}'.format(err) print print 'Predicting...' pred = ensemble.predict(X_test) with open(args.submission_fname, 'w') as f: f.write('Id,Prediction\n') for i, j in zip(Id_test.astype(int).tolist(), pred.tolist()): f.write('{0},{1}\n'.format(i, j)) print '\tOutput written to {0}.'.format(args.submission_fname) print print 'Done.\n'
# Train a cross-validated logistic-regression model on the Kaggle tf-idf
# training set: standardize, optionally PCA-whiten, then fit a CvModel.
# NOTE(review): `argparse`, `extract_training_data`, `StandardScaler`, `PCA`,
# `Pipeline`, `LogisticRegression`, and `CvModel` are expected to be imported
# earlier in this script.
parser = argparse.ArgumentParser()
parser.add_argument('penalty', help='L1 or L2 penalization.')
parser.add_argument('C', help='Regularization parameter.')
parser.add_argument('fit_intercept',
                    help='Whether to include a constant bias term in the loss function.')
parser.add_argument('n_components_pca', help='n_components for PCA.')
parser.add_argument('model_fname',
                    help='Absolute path to pickle the fitted CvModel.')
args = parser.parse_args()


def _parse_bool(s):
    """Interpret a CLI string as a boolean.

    Fixes the original `bool(args.fit_intercept)`: bool() of ANY non-empty
    string is True, so passing 'False' on the command line could never
    actually disable the intercept.
    """
    return s.lower() in ('true', '1', 'yes')


# 'None' (the literal string) on the CLI means "keep all components".
n_components_pca = (None if args.n_components_pca == 'None'
                    else int(args.n_components_pca))

Id, X, y = extract_training_data(
    '/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

# trans/clf specs
n_folds = 5
scaler = StandardScaler()
pca = PCA(n_components=n_components_pca, whiten=True)
# Fit the preprocessing pipeline on the full training matrix up front; the
# CvModel receives it already fitted.
trans = Pipeline([('scale_center', scaler), ('pca', pca)]).fit(X)
clf = LogisticRegression(
    penalty=args.penalty,
    C=float(args.C),
    fit_intercept=_parse_bool(args.fit_intercept),
    class_weight='auto'
)
cv_clf = CvModel(n_folds, trans, clf)
cv_clf.fit(X, y)
# NOTE(review): args.model_fname is parsed but never used in this chunk --
# presumably the fitted model is pickled there further down; confirm.
import os import re from operator import itemgetter from pyutils.kaggle_io.extract_inputs import extract_training_data from pyutils.ensemble_selection.CvModel import CvModel from sklearn.externals import joblib from scipy.spatial.distance import hamming model_dirs = [] with open(args.model_dirlist_fname, 'r') as f: for line in f: model_dirs.append(line.strip()) print 'Reading training data.\n' Id, X, y = extract_training_data(args.kaggle_root + '/data/kaggle_train_tf_idf.csv') print 'Scoring models.' hillclimb_errs = [] prog, model_rootdir = re.compile('.*\.pkl$'), args.kaggle_root + '/models' for m_dir in model_dirs: dir_contents = os.listdir(model_rootdir + '/' + m_dir) for fname in dir_contents: try: model_name = m_dir + '/' + prog.match(fname).group(0) print '\t{0}'.format(model_name) model = joblib.load(model_rootdir + '/' + model_name) err = hamming(model.hill_predict(X), y) hillclimb_errs.append((model_name, err)) except AttributeError: