parser.add_argument('labels', type=load_npz, help='Training labels (csv)') parser.add_argument('matches', type=argparse.FileType('r'), help='Matched test/train ids (tsv)') parser.add_argument('output', help='Output overrides (npz)') return parser if __name__ == "__main__": args = opts().parse_args() logging.info("Making lookup table for row ids") labels = args.labels['labels'] rlookup = {id: row for row, id in enumerate(args.labels['ids'])} ids = [] Y = [] logging.info("Getting mean labels for matched ids") for line in args.matches: test_id, train_ids = line.rstrip().split('\t', 1) ids.append(int(test_id)) train_ids = [rlookup[int(id)] for id in train_ids.split(',')] y_hat = labels[train_ids].mean(axis=0) Y.append(y_hat) ids = np.asarray(ids) Y = np.asarray(Y) logging.info("Saving overridden predictions to %s" % args.output) save_npz(args.output, ids=ids, labels=Y, header=args.labels['header'])
# NOTE(review): tail of opts() — the enclosing "def opts():" line precedes
# this chunk.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('train', type=load_encoded_features,
                        help='Training features (npz)')
    parser.add_argument('labels', type=load_npz, help='Training labels (npz)')
    parser.add_argument('test', type=load_encoded_features,
                        help='Test features (npz)')
    parser.add_argument('output', help='Output label predictions (npz)')
    parser.add_argument('--classifiers', type=argparse.FileType('w'),
                        help='Save fit classifiers (pkl)')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    logging.info("Preparing features")
    clf = TSEnsembleClassifier()
    logging.info("Fitting classifier(s)")
    # Fit on the pre-loaded training features against the label matrix.
    clf.fit(args.train, args.labels['labels'])
    if args.classifiers:
        logging.info("Saving classifiers")
        # NOTE(review): argparse opens this file in text mode ('w'); pickle
        # normally wants binary mode — presumably fine under Python 2's
        # protocol 0, but confirm.
        pickle.dump(clf, args.classifiers)
    logging.info("Predicting for test data")
    Y = clf.predict(args.test)
    logging.info("Saving predictions to %s" % args.output)
    save_npz(args.output, ids=args.test['ids'],
             header=args.labels['header'], labels=Y)
if __name__ == "__main__":
    args = opts().parse_args()
    logging.info("Making lookup table for row ids")
    labels = args.labels['labels']
    # Map each training-set id to its row index in the labels matrix.
    rlookup = {id: row for row, id in enumerate(args.labels['ids'])}
    ids = []
    Y = []
    # Y_train: true labels of all matched training rows (stacked).
    # Y_train_override: those same rows replaced by their group mean, so the
    # two can be compared to estimate the cost of overriding.
    Y_train = []
    Y_train_override = []
    logging.info("Getting mean labels for matched ids")
    for line in args.matches:
        # Each matches line: "<test_id>\t<train_id>,<train_id>,..."
        test_id, train_ids = line.rstrip().split('\t', 1)
        ids.append(int(test_id))
        train_ids = [rlookup[int(id)] for id in train_ids.split(',')]
        # Override prediction = mean label vector of the matched training rows.
        y_hat = labels[train_ids].mean(axis=0)
        Y.append(y_hat)
        Y_train.append(labels[train_ids])
        # Repeat the group mean once per matched training row for scoring.
        Y_train_override.append(np.tile(y_hat, (len(train_ids), 1)))
    ids = np.asarray(ids)
    Y = np.asarray(Y)
    Y_train = np.vstack(Y_train)
    Y_train_override = np.vstack(Y_train_override)
    # Sanity metric: loss incurred by replacing each matched training row's
    # true labels with its group mean.
    logging.info("Override cost = %f" \
        % score_predictions(Y_train, Y_train_override))
    logging.info("Saving overridden predictions to %s" % args.output)
    save_npz(args.output, ids=ids, labels=Y, header=args.labels['header'])
import argparse, logging, os

import numpy as np

from common import load_npz, save_npz

logging.basicConfig(level=logging.DEBUG)


def opts():
    """Build the CLI: split one npz feature file into -n smaller files."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('features', type=load_npz, help='Features (npz)')
    parser.add_argument('output', help='Split features (npz)')
    parser.add_argument('-n', type=int, required=True,
                        help='Number of files to split into')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    # Output files are named "<basename>.<i><ext>".
    basename, ext = os.path.splitext(args.output)
    pattern = basename + '.%d' + ext
    logging.info("Loading data")
    data = dict(args.features)
    nrows = data['ids'].shape[0]
    # FIX: ceiling division so the splits cover ALL rows. The original
    # floor division (nrows / args.n) silently dropped the last
    # nrows % args.n rows. "//" also keeps the arithmetic integral on
    # both Python 2 and 3.
    nrows_per_file = -(-nrows // args.n)
    logging.info("Writing splits to output")
    for i in xrange(args.n):
        logging.info(i)
        # Slice every array in the npz dict with the same row window so
        # ids/labels/features stay aligned.
        split = {k: v[i * nrows_per_file:(i + 1) * nrows_per_file]
                 for k, v in data.iteritems()}
        save_npz(pattern % i, **split)
# NOTE(review): interior of a per-label training loop — the loop header and
# the bindings of i, CFG, X_train and y precede this chunk.
        logging.info(i)
        # Per-label regularization strength; 5.0 when label i has no entry.
        C = CFG.get(i, 5.0)
        clf = LogisticRegression(C=C, tol=0.0001, random_state=42)
        if len(np.unique(y)) == 1:
            # Degenerate column (only one class seen): predict that constant
            # value for every test and every training row.
            Y_test.append(y[0]*np.ones(args.test.shape[0]))
            Y_meta.append(y[0]*np.ones(args.train.shape[0]))
        else:
            logging.info("Fitting")
            clf.fit(X_train, y)
            logging.info("Predicting")
            p = clf.predict_proba(args.test)
            # Binary case: P(label == 1) = 1 - P(label == 0).
            y = 1 - p[:,0]
            Y_test.append(y)
            # Same model re-applied to the training rows ("meta" features,
            # presumably for stacking — confirm with the consumer).
            p = clf.predict_proba(args.train)
            y = 1 - p[:,0]
            Y_meta.append(y)

logging.info("Saving predictions to %s" % args.output)
# NOTE(review): test ids come from a hard-coded relative path — fragile;
# depends on the working directory.
test = load_npz('../../data/test.npz')
# Stack per-label columns into an (nrows, nlabels) matrix.
Y_test = np.vstack(Y_test).T
save_npz(args.output, ids=test['ids'],
         header=args.labels['header'], labels=Y_test)
# Free the large arrays before building the meta matrix.
del Y_test, test
logging.info("Saving predictions to %s" % args.meta)
Y_meta = np.vstack(Y_meta).T
save_npz(args.meta, ids=args.labels['ids'],
         header=args.labels['header'], labels=Y_meta)
# NOTE(review): tail of load_vw_predictions() — the "def" line, the file
# iteration and the per-line "entry" split precede this chunk.
        id = int(entry[1])     # second field: example id (shadows builtin id)
        ids.append(id)
        y_i = float(entry[0])  # first field: VW raw score
        y.append(y_i)
    ids = np.array(ids)
    y = np.array(y)
    # Presumably sigmoid() maps VW raw margins to [0, 1] probabilities —
    # confirm against its definition above.
    p = sigmoid(y)
    return ids, p


def opts():
    # 'pred' is expanded by glob, so args.pred is a list of filenames.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('pred', type=glob.glob, help='VW predictions pattern')
    parser.add_argument('output', help='Output file (npz)')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    print "Loading VW predictions"
    # All prediction files share the same id order; take ids from the first.
    ids = load_vw_predictions(open(args.pred[0]))[0]
    print "Loading predictions for %d ids" % len(ids)
    # 33 label columns, hard-coded to match the dataset.
    Y = np.zeros((len(ids), 33))
    for fn in args.pred:
        # Label index parsed from the filename: last "_"-separated token
        # before the extension, with its leading letter stripped.
        i = int(fn.split('.')[0].split('_')[-1][1:])
        print "Label %d" % i
        Y[:, i] = load_vw_predictions(open(fn))[1]
    # Header (label names) is taken from the training labels file —
    # hard-coded relative path, depends on the working directory.
    labels = load_npz('../../data/trainLabels.npz')
    header = labels['header']
    save_npz(args.output, header=header, ids=ids, labels=Y)
# NOTE(review): tail of load_vw_predictions() — the "def" line, the file
# iteration, the "entry" split and the id parsing precede this chunk.
        y_i = float(entry[0])  # first field: VW raw score
        y.append(y_i)
    ids = np.array(ids)
    y = np.array(y)
    # Presumably sigmoid() maps VW raw margins to [0, 1] probabilities —
    # confirm against its definition above.
    p = sigmoid(y)
    return ids, p


def opts():
    # 'pred' is expanded by glob, so args.pred is a list of filenames.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('pred', type=glob.glob, help='VW predictions pattern')
    parser.add_argument('output', help='Output file (npz)')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    print "Loading VW predictions"
    # All prediction files share the same id order; take ids from the first.
    ids = load_vw_predictions(open(args.pred[0]))[0]
    print "Loading predictions for %d ids" % len(ids)
    # 33 label columns, hard-coded to match the dataset.
    Y = np.zeros((len(ids), 33))
    for fn in args.pred:
        # Label index parsed from the filename: last "_"-separated token
        # before the extension, with its leading letter stripped.
        i = int(fn.split('.')[0].split('_')[-1][1:])
        print "Label %d" % i
        Y[:,i] = load_vw_predictions(open(fn))[1]
    # Header (label names) is taken from the training labels file —
    # hard-coded relative path, depends on the working directory.
    labels = load_npz('../../data/trainLabels.npz')
    header = labels['header']
    save_npz(args.output, header=header, ids=ids, labels=Y)
def opts():
    """Build the CLI: copy selected label columns from an overrides file
    into a predictions file."""
    parser = argparse.ArgumentParser(description=__doc__)
    # Register the arguments from a table; order matters for positionals.
    for arg_name, arg_kwargs in (
            ('predictions', dict(type=load_npz,
                                 help='Predicted labels (npz)')),
            ('overrides', dict(type=load_npz,
                               help='Overrides labels (npz)')),
            ('--col', dict(type=int, action='append', required=True,
                           help='Columns to override')),
            ('output', dict(help='Output label predictions (npz)'))):
        parser.add_argument(arg_name, **arg_kwargs)
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    predicted = args.predictions['labels']
    replacement = args.overrides['labels']
    # Overwrite each requested column of the predictions in place.
    for column in args.col:
        logging.info("Overriding column %d" % column)
        predicted[:, column] = replacement[:, column]
    logging.info("Saving overridden predictions to %s" % args.output)
    save_npz(args.output,
             ids=args.predictions['ids'],
             header=args.predictions['header'],
             labels=predicted)
'''
# NOTE(review): the ''' above closes a module docstring whose opening line
# precedes this chunk.
import argparse, logging

from common import load_npz, save_npz

logging.basicConfig(level=logging.DEBUG)


def opts():
    # CLI: copy selected label columns from an overrides file into a
    # predictions file.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('predictions', type=load_npz,
                        help='Predicted labels (npz)')
    parser.add_argument('overrides', type=load_npz,
                        help='Overrides labels (npz)')
    # action='append': --col may be given multiple times.
    parser.add_argument('--col', type=int, action='append', required=True,
                        help='Columns to override')
    parser.add_argument('output', help='Output label predictions (npz)')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    Y = args.predictions['labels']
    overrides = args.overrides['labels']
    # Overwrite each requested column of the predictions in place.
    for col in args.col:
        logging.info("Overriding column %d" % col)
        Y[:,col] = overrides[:,col]
    logging.info("Saving overridden predictions to %s" % args.output)
    save_npz(args.output, ids=args.predictions['ids'],
             header=args.predictions['header'], labels=Y)
# NOTE(review): interior of a per-label training loop — the loop header and
# the bindings of C, X_train and y precede this chunk.
        clf = LogisticRegression(C=C, tol=0.0001, random_state=42)
        if len(np.unique(y)) == 1:
            # Degenerate column (only one class seen): predict that constant
            # value for every test and every training row.
            Y_test.append(y[0] * np.ones(args.test.shape[0]))
            Y_meta.append(y[0] * np.ones(args.train.shape[0]))
        else:
            logging.info("Fitting")
            clf.fit(X_train, y)
            logging.info("Predicting")
            p = clf.predict_proba(args.test)
            # Binary case: P(label == 1) = 1 - P(label == 0).
            y = 1 - p[:, 0]
            Y_test.append(y)
            # Same model re-applied to the training rows ("meta" features,
            # presumably for stacking — confirm with the consumer).
            p = clf.predict_proba(args.train)
            y = 1 - p[:, 0]
            Y_meta.append(y)

logging.info("Saving predictions to %s" % args.output)
# NOTE(review): test ids come from a hard-coded relative path — fragile;
# depends on the working directory.
test = load_npz('../../data/test.npz')
# Stack per-label columns into an (nrows, nlabels) matrix.
Y_test = np.vstack(Y_test).T
save_npz(args.output, ids=test['ids'],
         header=args.labels['header'], labels=Y_test)
# Free the large arrays before building the meta matrix.
del Y_test, test
logging.info("Saving predictions to %s" % args.meta)
Y_meta = np.vstack(Y_meta).T
save_npz(args.meta, ids=args.labels['ids'],
         header=args.labels['header'], labels=Y_meta)