def test_load():
    from arsenal.timer import timeit

    with timeit('load 10 sentences maxlength 8'):
        assert len(list(ptb('train', n=10, maxlength=8))) == 10

    d = PTB.standard_split(PTB_ROOT)

    # skip training because it's slow to load all of it.
    #with timeit('loading train'):
    #    train = list(d.load_fold(d.train))

    with timeit('load dev'):
        dev = list(d.load_fold(d.dev))
    with timeit('load test'):
        test = list(d.load_fold(d.test))

    print 'n_sentences: dev %s' % len(dev)
    print 'n_sentences: test %s' % len(test)

    # Expected sizes under the standard WSJ split: section 22 (dev) and 23 (test).
    assert len(dev) == 1700
    assert len(test) == 2416

    with timeit('load dev w/ preprocessing'):
        dev = list(ptb('dev'))
    assert len(dev) == 1700

    with timeit('load test w/ preprocessing'):
        test = list(ptb('test'))
    assert len(test) == 2416

    from ldp.prune.example import Setup
    s = Setup(grammar='medium', train=0, dev=3000, minlength=0, maxlength=1000000)
    assert len(s.dev) == 1700
    #s = Setup(grammar='medium', train=0, dev=0, minlength=0, maxlength=1000000)
    assert len(list(s.load('test'))) == 2416
def load(filename, default=None, saveit=False, verbose=False):
    "Load cached item from `filename`; on a miss, call `default` and cache the result."
    f = path(filename)
    if f.exists():
        if verbose:
            print '[load] %s, size = %s' % (f, filesize(f))
            with timeit('[load] %s' % filename):
                with open(f, 'rb') as pkl:
                    return cPickle.load(pkl)
        else:
            with open(f, 'rb') as pkl:
                return cPickle.load(pkl)
    else:
        if default is None:
            raise OSError("File not found '%s'" % filename)
        with timeit('[load] make %s' % filename):
            val = default()
        if saveit:
            save(filename, val, verbose=verbose)
        return val
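# Hypothetical usage of `load` (the path and payload below are illustrative,
# not from this module): compute an expensive value once, then reuse the
# pickled copy on subsequent calls.
#
#   counts = load('tmp/tag_bigrams.pkl',
#                 default=lambda: corpus.tag_ngram_counts(n=2),
#                 saveit=True,
#                 verbose=True)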
def save(filename, val, verbose=False):
    "Save `val` so we can load it via `load`."
    if verbose:
        with timeit('[save] %s' % filename):
            with open(filename, 'wb') as pkl:
                cPickle.dump(val, pkl)
        print '[save] %s, size = %s' % (filename, filesize(filename))
    else:
        with open(filename, 'wb') as pkl:
            cPickle.dump(val, pkl)
    return val
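def _demo_cache_roundtrip():
    # Minimal sketch (not part of the original module) exercising the
    # load/save round trip on a throwaway pickle file.
    import os, tempfile
    tmp = os.path.join(tempfile.mkdtemp(), 'squares.pkl')
    val = load(tmp, default=lambda: dict((i, i * i) for i in xrange(10)), saveit=True)
    assert load(tmp) == val   # second call is served from the on-disk cache
    os.remove(tmp)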
def _main(args):
    with timeit('load data'):
        corpus = CoNLL_U('data/UD/{lang}/UD_{lang}'.format(lang=args.lang),
                         tag_type=args.tag_type)

    if args.quick:
        corpus.train = corpus.train[:100]
        corpus.dev = corpus.train[:0]    # note: empty dev set in quick mode

    allowed_contexts = None
    if args.context_count is not None:
        print 'context count filter threshold %s' % args.context_count
        max_order = args.initial_order + args.outer_iterations
        if args.max_order is not None:
            max_order = args.max_order
        allowed_contexts = contexts_by_count(corpus, max_order, args.context_count)
        print 'allowed_contexts:', len(allowed_contexts)
        B = groupby2(allowed_contexts, len)
        print '(sizes %s)' % (', '.join('%s: %s' % (z, len(B[z])) for z in sorted(B)))

        if 0:
            # things that survived the threshold.
            for k, v in B.items():
                if k >= 10:    # context size >= 10
                    print
                    print k
                    for vv in v:
                        print '-'.join(vv)
            pl.plot(B.keys(), map(len, B.values()))
            pl.show()

        if 0:
            max_order = args.outer_iterations
            C = {}
            for n in xrange(1, max_order + 1):    # initial order + num iters
                C.update(corpus.tag_ngram_counts(n=n))
            pl.scatter(map(len, C.keys()), C.values(), lw=0, alpha=0.5)
            pl.show()

    elif args.max_order is not None:
        allowed_contexts = prefix_closure(
            fixed_order_contexts(corpus.Y, order=args.max_order))
        print 'allowed_contexts:', len(allowed_contexts)

    A = ActiveSet(corpus,
                  Y=corpus.Y,
                  train=corpus.make_instances('train', Instance),
                  dev=corpus.make_instances('dev', Instance),
                  group_budget=args.budget,
                  regularizer=args.C,
                  outer_iterations=args.outer_iterations,
                  inner_iterations=args.inner_iterations,
                  initial_contexts=fixed_order_contexts(corpus.Y, args.initial_order),
                  allowed_contexts=allowed_contexts,
                  no_failure_arcs=args.baseline,
                  dump=args.dump)
    A.active_set()
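# Sketch of an argument parser matching the attributes `_main` reads
# (args.lang, args.tag_type, args.quick, ...). The flag spellings and the
# defaults here are assumptions, not the original CLI definition.
def _make_arg_parser():
    import argparse
    p = argparse.ArgumentParser(description='Active-set training on UD data.')
    p.add_argument('--lang', required=True, help='UD language/treebank name')
    p.add_argument('--tag-type', dest='tag_type', default='upos')
    p.add_argument('--quick', action='store_true', help='tiny train, empty dev')
    p.add_argument('--context-count', dest='context_count', type=int, default=None)
    p.add_argument('--max-order', dest='max_order', type=int, default=None)
    p.add_argument('--initial-order', dest='initial_order', type=int, default=1)
    p.add_argument('--outer-iterations', dest='outer_iterations', type=int, default=5)
    p.add_argument('--inner-iterations', dest='inner_iterations', type=int, default=10)
    p.add_argument('--budget', type=int, default=None, help='group budget')
    p.add_argument('-C', type=float, default=1.0, help='regularizer strength')
    p.add_argument('--baseline', action='store_true', help='disable failure arcs')
    p.add_argument('--dump', default=None, help='where to dump results')
    return p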