Exemplo n.º 1
0
def vocab_table(name):
    exp_dir = REMOTE_PARSERS.join(name)
    with cd(str(exp_dir)):
        conditions = run("ls %s" % exp_dir, quiet=True).split()
        for condition in sorted(conditions):
            accs = get_accs(str(exp_dir.join(condition)))
            print condition, len(accs), sum(accs) / len(accs)
Exemplo n.º 2
0
def vocab_thresholds(name, k=8, n=1, size=10000):
    """Launch pruning-threshold training runs for experiment *name*.

    For each n-gram feature-set size, trains *n* parsers per pruning
    threshold under REMOTE_PARSERS/<name>/<size>_ngrams, using the
    Stanford data at REMOTE_STANFORD.

    Args (may arrive as strings from a task command line, hence int()):
        k: beam width passed through to train_n.
        n: number of training repetitions per condition.
        size: number of training sentences.
    """
    base_dir = REMOTE_PARSERS.join(name)
    n = int(n)
    k = int(k)
    size = int(size)
    data = str(REMOTE_STANFORD)

    thresholds = [75]
    ngram_sizes = [60, 90, 120]
    for n_ngrams in ngram_sizes:
        # n_ngrams == 0 selects the baseline 'zhang' feature templates;
        # any positive size uses the 'full' set.
        feat_name = 'zhang' if n_ngrams == 0 else 'full'
        exp_dir = str(base_dir.join('%d_ngrams' % n_ngrams))
        for t in thresholds:
            thresh = 'thresh%d' % t
            train_n(n, thresh, exp_dir, data, k=k, i=15, t=t, f=100,
                    train_alg='max', label="Stanford", n_sents=size,
                    feat_str=feat_name, ngrams=n_ngrams)
Exemplo n.º 3
0
def bitable(name):
    """Print a LaTeX table of conditions that significantly beat the baseline.

    Compares each condition directory under REMOTE_PARSERS/<name> against
    the '0_S0_N0' baseline with a Wilcoxon signed-rank test, prints one
    table row per condition whose mean accuracy exceeds the baseline at
    p < 0.01, then prints the list of winning n-grams and its length.
    """
    exp_dir = REMOTE_PARSERS.join(name)
    # '0_S0_N0' is the baseline condition; its mean accuracy is the
    # reference every other condition is measured against.
    base_accs = get_accs(str(exp_dir.join('0_S0_N0')))
    base_acc = sum(base_accs) / len(base_accs)
    print "Base:", len(base_accs), sum(base_accs) / len(base_accs)
    results = []
    with cd(str(exp_dir)):
        ngrams = run("ls %s" % exp_dir, quiet=True).split()
        for ngram in sorted(ngrams):
            # Skip the baseline itself (and a legacy 'base' directory).
            if ngram == 'base' or ngram == '0_S0_N0':
                continue
            accs = get_accs(str(exp_dir.join(ngram)))
            print ngram, len(accs)
            # Conditions with no results yet are ignored.
            if not accs:
                continue
            _, avg, stdev = _get_stdev(accs)
            # Paired signed-rank test against the baseline runs; presumably
            # accs and base_accs are aligned by run index — TODO confirm.
            z, p = scipy.stats.wilcoxon(accs, base_accs)
            parts = ngram.split('_')  # NOTE(review): unused
            # Directories whose name merely starts with 'base' reset the
            # reference mean instead of becoming a result row.
            if ngram.startswith('base'):
                base_acc = avg
            else:
                results.append((avg, ngram, stdev, p))
    good_ngrams = []
    # Sort by mean accuracy, highest first.
    results.sort()
    results.reverse()
    for acc, ngram, stdev, p in results:
        # Drop the leading field of the directory name for display.
        ngram = '_'.join(ngram.split('_')[1:])
        if acc > base_acc and p < 0.01:
            print r'%s & %.3f & %.3f \\' % (ngram, acc - base_acc, p)
            good_ngrams.append(ngram)
    print good_ngrams
    print len(good_ngrams)
Exemplo n.º 4
0
def vocab_table(name):
    exp_dir = REMOTE_PARSERS.join(name)
    with cd(str(exp_dir)):
        conditions = run("ls %s" % exp_dir, quiet=True).split()
        for condition in sorted(conditions):
            accs = get_accs(str(exp_dir.join(condition)))
            print condition, len(accs), sum(accs) / len(accs)
Exemplo n.º 5
0
def vocab_thresholds(name, k=8, n=1, size=10000):
    """Launch pruning-threshold training runs for experiment *name*.

    For each n-gram feature-set size, trains *n* parsers per pruning
    threshold under REMOTE_PARSERS/<name>/<size>_ngrams, using the
    Stanford data at REMOTE_STANFORD.

    Args (may arrive as strings from a task command line, hence int()):
        k: beam width passed through to train_n.
        n: number of training repetitions per condition.
        size: number of training sentences.
    """
    base_dir = REMOTE_PARSERS.join(name)
    n = int(n)
    k = int(k)
    size = int(size)
    data = str(REMOTE_STANFORD)

    thresholds = [75]
    ngram_sizes = [60, 90, 120]
    for n_ngrams in ngram_sizes:
        # n_ngrams == 0 selects the baseline 'zhang' feature templates;
        # any positive size uses the 'full' set.
        feat_name = 'zhang' if n_ngrams == 0 else 'full'
        exp_dir = str(base_dir.join('%d_ngrams' % n_ngrams))
        for t in thresholds:
            thresh = 'thresh%d' % t
            train_n(n, thresh, exp_dir, data, k=k, i=15, t=t, f=100,
                    train_alg='max', label="Stanford", n_sents=size,
                    feat_str=feat_name, ngrams=n_ngrams)
Exemplo n.º 6
0
def bitable(name):
    """Print a LaTeX table of conditions that significantly beat the baseline.

    Compares each condition directory under REMOTE_PARSERS/<name> against
    the '0_S0_N0' baseline with a Wilcoxon signed-rank test, prints one
    table row per condition whose mean accuracy exceeds the baseline at
    p < 0.01, then prints the list of winning n-grams and its length.
    """
    exp_dir = REMOTE_PARSERS.join(name)
    # '0_S0_N0' is the baseline condition; its mean accuracy is the
    # reference every other condition is measured against.
    base_accs = get_accs(str(exp_dir.join('0_S0_N0')))
    base_acc = sum(base_accs) / len(base_accs)
    print "Base:", len(base_accs), sum(base_accs) / len(base_accs)
    results = []
    with cd(str(exp_dir)):
        ngrams = run("ls %s" % exp_dir, quiet=True).split()
        for ngram in sorted(ngrams):
            # Skip the baseline itself (and a legacy 'base' directory).
            if ngram == 'base' or ngram == '0_S0_N0':
                continue
            accs = get_accs(str(exp_dir.join(ngram)))
            print ngram, len(accs)
            # Conditions with no results yet are ignored.
            if not accs:
                continue
            _, avg, stdev = _get_stdev(accs)
            # Paired signed-rank test against the baseline runs; presumably
            # accs and base_accs are aligned by run index — TODO confirm.
            z, p = scipy.stats.wilcoxon(accs, base_accs)
            parts = ngram.split('_')  # NOTE(review): unused
            # Directories whose name merely starts with 'base' reset the
            # reference mean instead of becoming a result row.
            if ngram.startswith('base'):
                base_acc = avg
            else:
                results.append((avg, ngram, stdev, p))
    good_ngrams = []
    # Sort by mean accuracy, highest first.
    results.sort()
    results.reverse()
    for acc, ngram, stdev, p in results:
        # Drop the leading field of the directory name for display.
        ngram = '_'.join(ngram.split('_')[1:])
        if acc > base_acc and p < 0.01:
            print r'%s & %.3f & %.3f \\' % (ngram, acc - base_acc, p)
            good_ngrams.append(ngram)
    print good_ngrams
    print len(good_ngrams)