Example 1
# Standard-library and third-party imports needed by this snippet (not shown in
# the scraped excerpt); the project-local helpers feats_and_classify,
# combine_data, crossval, NN and NeuralNetConfig are assumed to be importable
# from the surrounding cwi2016 repository.
import argparse
import os
import sys

import numpy as np


def main():
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    data = scriptdir+'/../data/cwi_training/cwi_training.txt.lbl.conll'
    testdata = scriptdir+'/../data/cwi_testing/cwi_testing.gold.txt.lbl.conll'
    pickled_data = scriptdir+'/../data.pickle'
    parser = argparse.ArgumentParser()
    parser.add_argument('--threshold', '-t', type=float, help='Threshold for predicting 0/1. If not specified, the optimal threshold will first be computed as the median of all CV splits. May take a while.')
    parser.add_argument('--iterations', '-i', type=int, default=50, help='Training iterations.')
    parser.add_argument('--hidden-layers', '-l', dest='layers', required=True, type=int, nargs='+', help='List of layer sizes')
    parser.add_argument('--cv-splits', '-c', dest='splits', type=int, help='No. of crossvalidation splits. If not specified, no CV will be performed.')
    parser.add_argument('--data', '-d', default=data, help='Features and labels')
    parser.add_argument('--testdata', '-y', default=testdata,  help='Test data (not needed for crossval).')
    parser.add_argument('--verbose', '-v', dest='verbose', action='store_true', help='Print average loss at every training iteration.')
    parser.add_argument('--output', '-o', help="Output file")
    parser.add_argument('--features', '-f', dest='features', default=[], type=str, nargs='+', help='List of feature types')

    args = parser.parse_args()
    # X, y = load_pickled(args.data)
    combined_data = 'X_y_all.txt'
    cutoff = combine_data(args.data, args.testdata, combined_data)
    X, y, _ = feats_and_classify.collect_features(combined_data, True, args.features)
    X_tr = X[:cutoff]
    y_tr = y[:cutoff]
    X_te = X[cutoff:]
    y_te = y[cutoff:]
    conf = NeuralNetConfig(X=X, y=y, layers=args.layers, iterations=args.iterations, verbose=args.verbose)

    if args.splits:
        if args.threshold is not None:  # 0.0 is a valid threshold
            crossval(X_tr,y_tr,args.splits, conf, t=args.threshold)
        else:
            # compute optimal threshold for each CV split
            print '### Computing optimal threshold... '
            ts = crossval(X_tr,y_tr,args.splits, conf)
            avg = np.average(ts)
            med = np.median(ts)
            print '\nThresholds for crossval splits:', ts
            print 'Mean threshold', avg
            print 'Median threshold', med
            print 'Threshold st.dev.', np.std(ts)
            # Run CV with fixed avg/median threshold
            print '\n\n### Running with avg. threshold... '
            crossval(X_tr,y_tr,args.splits, conf, t=avg)
            print '\n\n### Running with med. threshold... '
            crossval(X_tr,y_tr,args.splits, conf, t=med)
    else:
        nn = NN(conf)
        nn.train(X_tr,y_tr,args.iterations)
        if args.testdata:
            # X_test, y_test = load_pickled(args.testdata)
            pred = nn.get_output(X_te)
            if args.output:
                with open(args.output, 'w') as of:
                    for p in pred:
                        of.write('%f\n'%p)
            t, res = nn.test(X_te,y_te,args.threshold)
            resout = "G: %f, R: %f, A: %f, P: %f\n"%res
            sys.stderr.write('%s %f\n'%(' '.join(args.features), t))
            sys.stderr.write(resout)
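
Example 1 calls a combine_data helper that the excerpt does not show. Judging only from the call site (it writes a combined file and its return value is used as the cutoff between training and test rows), a plausible reconstruction is sketched below; the instance-counting heuristic is an assumption, and the original repo's implementation may differ.

def combine_data(train_path, test_path, out_path):
    """Concatenate the training and test .conll files into out_path and return
    the number of training instances (the train/test cutoff).

    Reconstructed sketch, not the original code: non-empty lines are assumed to
    correspond one-to-one with feature rows produced by collect_features.
    """
    n_train = 0
    with open(out_path, 'w') as out:
        with open(train_path) as f:
            for line in f:
                out.write(line)
                if line.strip():
                    n_train += 1
        with open(test_path) as f:
            for line in f:
                out.write(line)
    return n_train
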
Example 2
# Imports needed by this example (not shown in the scraped excerpt);
# feats_and_classify_py2 is a project-local module from the cwi2016 repo.
import numpy as np
from sklearn.linear_model import LogisticRegression

import feats_and_classify_py2


def predictTestSet():
    # Generate training features and labels.
    trainfile = '/home/natschluter/GroupAlgorithms/cwi2016/data/cwi_training/cwi_training_cat.lbl.conll'
    trainfeatures, trainlabels, vec = feats_and_classify_py2.collect_features(trainfile)

    # Generate features for the concatenated training+test data so that train
    # and test instances share a single feature space.
    bothfiles = '/home/natschluter/GroupAlgorithms/cwi2016/data/train_and_test1.conll'
    bothfeatures, bothlabels, bothvec = feats_and_classify_py2.collect_features(bothfiles)

    # Median of the per-split thresholds obtained from cross-validation.
    thresholds_med = np.median(np.array(
        [0.145, 0.85, 0.12, 0.657, 0.71, 0.824, 0.506, 0.461, 0.662, 0.888]))

    # The first len(trainfeatures) rows of the combined matrix are the training
    # instances; the remaining rows are the test instances.
    TrainX = bothfeatures[:len(trainfeatures)]
    TrainY = bothlabels[:len(trainlabels)]
    TestX = bothfeatures[len(trainfeatures):]

    maxent = LogisticRegression(penalty='l2')
    print('training...')
    maxent.fit(TrainX, TrainY)
    print('predicting...')
    ypred_probs = maxent.predict_proba(TestX)
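
The excerpt ends right after predict_proba, so thresholds_med is computed but never applied. A minimal follow-up sketch, assuming class 1 is the positive ("complex") label and that probabilities at or above the median threshold map to 1; the output filename test_predictions.txt is hypothetical.

    # Hypothetical continuation: threshold the positive-class probabilities.
    ypred = (ypred_probs[:, 1] >= thresholds_med).astype(int)
    # Write one 0/1 label per line, mirroring the prediction output of Example 1.
    with open('test_predictions.txt', 'w') as out:
        for label in ypred:
            out.write('%d\n' % label)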