parser.add_argument('labels', type=load_npz,
        help='Training labels (csv)')
    parser.add_argument('matches', type=argparse.FileType('r'),
        help='Matched test/train ids (tsv)')
    parser.add_argument('output',
        help='Output overrides (npz)')
    return parser

if __name__ == "__main__":
    # Script entry point: for every test id listed in the matches file, write
    # the mean of the label rows of its matched training ids.
    args = opts().parse_args()

    logging.info("Making lookup table for row ids")
    train_labels = args.labels['labels']
    # Maps a training id to its row position in the label matrix.
    row_of = dict(
        (train_id, pos) for pos, train_id in enumerate(args.labels['ids']))

    matched_ids = []
    mean_labels = []
    logging.info("Getting mean labels for matched ids")
    for record in args.matches:
        # Each record is "<test id>\t<train id>,<train id>,...".
        test_field, train_field = record.rstrip().split('\t', 1)
        matched_ids.append(int(test_field))
        rows = [row_of[int(token)] for token in train_field.split(',')]
        mean_labels.append(train_labels[rows].mean(axis=0))

    ids = np.asarray(matched_ids)
    Y = np.asarray(mean_labels)

    destination = args.output
    logging.info("Saving overridden predictions to %s" % destination)
    save_npz(destination, ids=ids, labels=Y,
             header=args.labels['header'])
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('train', type=load_encoded_features,
        help='Training features (npz)')
    parser.add_argument('labels', type=load_npz,
        help='Training labels (npz)')
    parser.add_argument('test', type=load_encoded_features,
        help='Test features (npz)')
    parser.add_argument('output',
        help='Output label predictions (npz)')
    parser.add_argument('--classifiers', type=argparse.FileType('w'),
        help='Save fit classifiers (pkl)')
    return parser

if __name__ == "__main__":
    # Script entry point: fit the ensemble on the training data, optionally
    # pickle it, then predict labels for the test data and save them.
    args = opts().parse_args()

    logging.info("Preparing features")
    model = TSEnsembleClassifier()
    logging.info("Fitting classifier(s)")
    model.fit(args.train, args.labels['labels'])

    # --classifiers is optional; only dump the fitted model when it was given.
    sink = args.classifiers
    if sink:
        logging.info("Saving classifiers")
        pickle.dump(model, sink)

    logging.info("Predicting for test data")
    predicted = model.predict(args.test)

    destination = args.output
    logging.info("Saving predictions to %s" % destination)
    save_npz(
        destination,
        ids=args.test['ids'],
        header=args.labels['header'],
        labels=predicted,
    )
示例#3
0
if __name__ == "__main__":
    # Script entry point: build mean-label overrides for matched test ids and
    # log the score of replacing each matched training row with its group mean.
    args = opts().parse_args()

    logging.info("Making lookup table for row ids")
    labels = args.labels['labels']
    # Maps a training id to its row position in the label matrix.
    row_index = {}
    for position, train_id in enumerate(args.labels['ids']):
        row_index[train_id] = position

    test_ids, group_means, group_rows, group_tiled = [], [], [], []
    logging.info("Getting mean labels for matched ids")
    for raw in args.matches:
        # Each record is "<test id>\t<train id>,<train id>,...".
        head, tail = raw.rstrip().split('\t', 1)
        test_ids.append(int(head))
        rows = [row_index[int(token)] for token in tail.split(',')]
        matched = labels[rows]
        centroid = matched.mean(axis=0)
        group_means.append(centroid)
        group_rows.append(matched)
        # Repeat the mean once per matched row so it lines up with `matched`.
        group_tiled.append(np.tile(centroid, (len(rows), 1)))

    ids = np.asarray(test_ids)
    Y = np.asarray(group_means)
    Y_train = np.vstack(group_rows)
    Y_train_override = np.vstack(group_tiled)
    cost = score_predictions(Y_train, Y_train_override)
    logging.info("Override cost = %f" % cost)

    logging.info("Saving overridden predictions to %s" % args.output)
    save_npz(args.output, ids=ids, labels=Y, header=args.labels['header'])
示例#4
0
import argparse, logging, os
import numpy as np
from common import load_npz, save_npz

logging.basicConfig(level=logging.DEBUG)

def opts():
    """Build the argument parser for the feature-splitting script.

    Returns:
        An ``argparse.ArgumentParser`` with two positionals, ``features``
        (converted with ``load_npz``) and ``output``, plus the required
        integer option ``-n``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    specs = [
        ('features', dict(type=load_npz, help='Features (npz)')),
        ('output', dict(help='Split features (npz)')),
        ('-n', dict(type=int, required=True,
                    help='Number of files to split into')),
    ]
    for flag, settings in specs:
        cli.add_argument(flag, **settings)
    return cli

def split_bounds(nrows, n):
    """Return ``n`` contiguous ``(start, stop)`` row ranges covering ``nrows``.

    The ranges are half-open, in order, and together cover every row exactly
    once. When ``nrows`` is not a multiple of ``n`` the ranges differ in size
    by at most one row, so no trailing rows are dropped.

    Args:
        nrows: Total number of rows to distribute.
        n: Number of ranges to produce.

    Returns:
        A list of ``n`` ``(start, stop)`` tuples.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("number of splits must be at least 1, got %d" % n)
    # Floor division keeps the edges integral on both Python 2 and 3.
    edges = [i * nrows // n for i in range(n + 1)]
    return list(zip(edges[:-1], edges[1:]))


if __name__ == "__main__":
    # Script entry point: slice every array of the input npz into args.n
    # row-wise pieces and save them as "<output stem>.<i><output ext>".
    args = opts().parse_args()
    basename, ext = os.path.splitext(args.output)
    pattern = basename + '.%d' + ext
    logging.info("Loading data")
    data = dict(args.features)
    # The 'ids' array defines the row count; every array is sliced by row.
    nrows = data['ids'].shape[0]
    logging.info("Writing splits to output")
    for i, (start, stop) in enumerate(split_bounds(nrows, args.n)):
        logging.info(i)
        split = {k: v[start:stop] for k, v in data.items()}
        save_npz(pattern % i, **split)
            
        logging.info(i)
        C = CFG.get(i, 5.0)
        clf = LogisticRegression(C=C, tol=0.0001, random_state=42)
        if len(np.unique(y)) == 1:
            Y_test.append(y[0]*np.ones(args.test.shape[0]))
            Y_meta.append(y[0]*np.ones(args.train.shape[0]))
        else:
            logging.info("Fitting")
            clf.fit(X_train, y)
            logging.info("Predicting")
            p = clf.predict_proba(args.test)
            y = 1 - p[:,0]
            Y_test.append(y)
            p = clf.predict_proba(args.train)
            y = 1 - p[:,0]
            Y_meta.append(y)
            
    logging.info("Saving predictions to %s" % args.output)
    test = load_npz('../../data/test.npz')
    Y_test = np.vstack(Y_test).T
    save_npz(args.output, ids=test['ids'], 
        header=args.labels['header'], labels=Y_test)
    del Y_test, test
        
    logging.info("Saving predictions to %s" % args.meta)
    Y_meta = np.vstack(Y_meta).T
    save_npz(args.meta, ids=args.labels['ids'], 
        header=args.labels['header'], labels=Y_meta)
        
示例#6
0
        id = int(entry[1])
        ids.append(id)
        y_i = float(entry[0])
        y.append(y_i)
    ids = np.array(ids)
    y = np.array(y)
    p = sigmoid(y)
    return ids, p


def opts():
    """Build the argument parser for the VW prediction merging script.

    Returns:
        An ``argparse.ArgumentParser`` with two positionals: ``pred``, which
        is expanded through ``glob.glob`` into a list of paths, and ``output``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    positionals = (
        ('pred', {'type': glob.glob, 'help': 'VW predictions pattern'}),
        ('output', {'help': 'Output file (npz)'}),
    )
    for name, settings in positionals:
        cli.add_argument(name, **settings)
    return cli


def label_index(fn):
    """Return the label column encoded in a VW prediction file name.

    The column is read from the last ``_``-separated token of the file name
    (up to its first dot), after dropping that token's first character;
    e.g. ``pred_y3.txt`` gives ``3``. Only the base name is inspected, so
    dots in the directory part (``../../preds/x_y3.txt``) do not interfere.

    Args:
        fn: Path of a prediction file.

    Returns:
        The label column index as an ``int``.

    Raises:
        ValueError: If the token does not end in an integer.
    """
    import os  # local import: the visible part of this script does not import os
    stem = os.path.basename(fn).split('.')[0]
    return int(stem.split('_')[-1][1:])


if __name__ == "__main__":
    # Script entry point: merge per-label VW prediction files into a single
    # label matrix and save it together with the training label header.
    args = opts().parse_args()
    if not args.pred:
        # The glob matched nothing; fail clearly instead of with IndexError.
        raise SystemExit("No VW prediction files matched the given pattern")
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Loading VW predictions")
    with open(args.pred[0]) as f:
        ids = load_vw_predictions(f)[0]
    print("Loading predictions for %d ids" % len(ids))
    # The number of label columns (33) is hard-coded by this script.
    Y = np.zeros((len(ids), 33))
    for fn in args.pred:
        i = label_index(fn)
        print("Label %d" % i)
        # Close each prediction file as soon as it has been parsed.
        with open(fn) as f:
            Y[:, i] = load_vw_predictions(f)[1]
    labels = load_npz('../../data/trainLabels.npz')
    header = labels['header']
    save_npz(args.output, header=header, ids=ids, labels=Y)
        y_i = float(entry[0])
        y.append(y_i)
    ids = np.array(ids)
    y = np.array(y)
    p = sigmoid(y)
    return ids, p

def opts():
    """Create the command-line parser used by the VW prediction merger.

    Returns:
        An ``argparse.ArgumentParser`` taking a ``pred`` pattern (expanded to
        a list of paths by ``glob.glob``) followed by an ``output`` path.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    add = cli.add_argument
    add('pred', help='VW predictions pattern', type=glob.glob)
    add('output', help='Output file (npz)')
    return cli

def label_index(fn):
    """Return the label column encoded in a VW prediction file name.

    The column is read from the last ``_``-separated token of the file name
    (up to its first dot), after dropping that token's first character;
    e.g. ``pred_y3.txt`` gives ``3``. Only the base name is inspected, so
    dots in the directory part (``../../preds/x_y3.txt``) do not interfere.

    Args:
        fn: Path of a prediction file.

    Returns:
        The label column index as an ``int``.

    Raises:
        ValueError: If the token does not end in an integer.
    """
    import os  # local import: the visible part of this script does not import os
    stem = os.path.basename(fn).split('.')[0]
    return int(stem.split('_')[-1][1:])


if __name__ == "__main__":
    # Script entry point: merge per-label VW prediction files into a single
    # label matrix and save it together with the training label header.
    args = opts().parse_args()
    if not args.pred:
        # The glob matched nothing; fail clearly instead of with IndexError.
        raise SystemExit("No VW prediction files matched the given pattern")
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Loading VW predictions")
    with open(args.pred[0]) as f:
        ids = load_vw_predictions(f)[0]
    print("Loading predictions for %d ids" % len(ids))
    # The number of label columns (33) is hard-coded by this script.
    Y = np.zeros((len(ids), 33))
    for fn in args.pred:
        i = label_index(fn)
        print("Label %d" % i)
        # Close each prediction file as soon as it has been parsed.
        with open(fn) as f:
            Y[:, i] = load_vw_predictions(f)[1]
    labels = load_npz('../../data/trainLabels.npz')
    header = labels['header']
    save_npz(args.output, header=header, ids=ids, labels=Y)
    
        
def opts():
    """Build the argument parser for the column-override script.

    Returns:
        An ``argparse.ArgumentParser`` with the positionals ``predictions``
        and ``overrides`` (both converted with ``load_npz``) and ``output``,
        plus the required, repeatable integer option ``--col``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    # Registration order is kept: it fixes the order of the positionals.
    specs = [
        ('predictions', dict(type=load_npz, help='Predicted labels (npz)')),
        ('overrides', dict(type=load_npz, help='Overrides labels (npz)')),
        ('--col', dict(type=int, action='append', required=True,
                       help='Columns to override')),
        ('output', dict(help='Output label predictions (npz)')),
    ]
    for flag, settings in specs:
        cli.add_argument(flag, **settings)
    return cli


if __name__ == "__main__":
    # Script entry point: copy the requested columns from the overrides label
    # matrix into the predicted label matrix and save the result.
    args = opts().parse_args()

    predicted = args.predictions['labels']
    replacement = args.overrides['labels']
    for column in args.col:
        logging.info("Overriding column %d" % column)
        # In-place column assignment on the predictions array.
        predicted[:, column] = replacement[:, column]

    destination = args.output
    logging.info("Saving overridden predictions to %s" % destination)
    save_npz(
        destination,
        ids=args.predictions['ids'],
        header=args.predictions['header'],
        labels=predicted,
    )
'''

import argparse, logging
from common import load_npz, save_npz

logging.basicConfig(level=logging.DEBUG)

def opts():
    """Create the command-line parser for overriding prediction columns.

    Returns:
        An ``argparse.ArgumentParser`` with the positionals ``predictions``
        and ``overrides`` (both converted with ``load_npz``) and ``output``,
        plus the required, repeatable integer option ``--col``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    add = cli.add_argument
    # Registration order is kept: it fixes the order of the positionals.
    add('predictions', help='Predicted labels (npz)', type=load_npz)
    add('overrides', help='Overrides labels (npz)', type=load_npz)
    add('--col', help='Columns to override', required=True,
        action='append', type=int)
    add('output', help='Output label predictions (npz)')
    return cli

if __name__ == "__main__":
    # Script entry point: replace selected columns of the predicted labels
    # with the matching columns of the overrides, then write the result.
    args = opts().parse_args()

    label_matrix = args.predictions['labels']
    override_matrix = args.overrides['labels']
    for index in args.col:
        logging.info("Overriding column %d" % index)
        # In-place column assignment on the predictions array.
        label_matrix[:, index] = override_matrix[:, index]

    logging.info("Saving overridden predictions to %s" % args.output)
    payload = dict(
        ids=args.predictions['ids'],
        header=args.predictions['header'],
        labels=label_matrix,
    )
    save_npz(args.output, **payload)
示例#10
0
        clf = LogisticRegression(C=C, tol=0.0001, random_state=42)
        if len(np.unique(y)) == 1:
            Y_test.append(y[0] * np.ones(args.test.shape[0]))
            Y_meta.append(y[0] * np.ones(args.train.shape[0]))
        else:
            logging.info("Fitting")
            clf.fit(X_train, y)
            logging.info("Predicting")
            p = clf.predict_proba(args.test)
            y = 1 - p[:, 0]
            Y_test.append(y)
            p = clf.predict_proba(args.train)
            y = 1 - p[:, 0]
            Y_meta.append(y)

    logging.info("Saving predictions to %s" % args.output)
    test = load_npz('../../data/test.npz')
    Y_test = np.vstack(Y_test).T
    save_npz(args.output,
             ids=test['ids'],
             header=args.labels['header'],
             labels=Y_test)
    del Y_test, test

    logging.info("Saving predictions to %s" % args.meta)
    Y_meta = np.vstack(Y_meta).T
    save_npz(args.meta,
             ids=args.labels['ids'],
             header=args.labels['header'],
             labels=Y_meta)