def vocab_table(name): exp_dir = REMOTE_PARSERS.join(name) with cd(str(exp_dir)): conditions = run("ls %s" % exp_dir, quiet=True).split() for condition in sorted(conditions): accs = get_accs(str(exp_dir.join(condition))) print condition, len(accs), sum(accs) / len(accs)
def vocab_thresholds(name, k=8, n=1, size=10000):
    """Launch pruned training runs over several ngram feature-set sizes.

    For each ngram size, trains ``n`` parsers per pruning threshold under
    REMOTE_PARSERS/<name>/<size>_ngrams, using the Stanford data.

    Params (all coerced to int, so they may arrive as strings from the
    fabric command line):
        k    -- beam width passed through to train_n.
        n    -- number of training repetitions per condition.
        size -- number of training sentences.
    """
    base_dir = REMOTE_PARSERS.join(name)
    n = int(n)
    k = int(k)
    size = int(size)
    data = str(REMOTE_STANFORD)
    thresholds = [75]
    ngram_sizes = [60, 90, 120]
    for n_ngrams in ngram_sizes:
        # 'zhang' is the baseline feature set used when no ngrams are
        # requested; the dead branch is kept so a 0 entry in ngram_sizes
        # still works.
        feat_name = 'zhang' if n_ngrams == 0 else 'full'
        exp_dir = str(base_dir.join('%d_ngrams' % n_ngrams))
        # NOTE(review): an unpruned baseline run used to be trained here
        # (t=0, f=0) but was disabled; re-enable via train_n if needed.
        for t in thresholds:
            thresh = 'thresh%d' % t
            train_n(n, thresh, exp_dir, data, k=k, i=15, t=t, f=100,
                    train_alg='max', label="Stanford", n_sents=size,
                    feat_str=feat_name, ngrams=n_ngrams)
def bitable(name): exp_dir = REMOTE_PARSERS.join(name) base_accs = get_accs(str(exp_dir.join('0_S0_N0'))) base_acc = sum(base_accs) / len(base_accs) print "Base:", len(base_accs), sum(base_accs) / len(base_accs) results = [] with cd(str(exp_dir)): ngrams = run("ls %s" % exp_dir, quiet=True).split() for ngram in sorted(ngrams): if ngram == 'base' or ngram == '0_S0_N0': continue accs = get_accs(str(exp_dir.join(ngram))) print ngram, len(accs) if not accs: continue _, avg, stdev = _get_stdev(accs) z, p = scipy.stats.wilcoxon(accs, base_accs) parts = ngram.split('_') if ngram.startswith('base'): base_acc = avg else: results.append((avg, ngram, stdev, p)) good_ngrams = [] results.sort() results.reverse() for acc, ngram, stdev, p in results: ngram = '_'.join(ngram.split('_')[1:]) if acc > base_acc and p < 0.01: print r'%s & %.3f & %.3f \\' % (ngram, acc - base_acc, p) good_ngrams.append(ngram) print good_ngrams print len(good_ngrams)