def random_contexts(sigma, depth, size):
    """Generate a random context set W with |W| = `size` (before prefix
    closure), where each w in W has length `depth`, over an alphabet of
    size `sigma`.
    """
    sigma = range(sigma)
    possible = list(xprod(*(sigma,) * depth))
    np.random.shuffle(possible)
    return list(prefix_closure(possible[:size]))
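# Illustrative sketch (not part of the original module): a minimal pure-Python
# reference for the prefix closure assumed above. The real `prefix_closure`
# helper may differ in details (e.g., whether it also yields the empty context).
def _prefix_closure_sketch(contexts):
    "Every context together with all of its non-empty prefixes."
    return {c[:i] for c in contexts for i in range(1, len(c) + 1)}

# e.g. _prefix_closure_sketch({(0, 2, 1)}) == {(0,), (0, 2), (0, 2, 1)}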
def update(self, sigma, C):
    if self.no_failure_arcs:
        # Baseline: no failure arcs, so explicitly close the context set
        # under last-character substitution (plus all unigrams).
        C = set(prefix_closure(C))
        C.update((a,) for a in sigma)
        b4 = len(C)
        C = set(last_char_sub_closure(sigma, C))
        C.add(())
        print('[last-char closure] before: %s, after: %s' % (b4, len(C)))
    return VoCRF.update(self, sigma, C)
def __init__(self, corpus, Y, train, dev, initial_contexts, outer_iterations,
             inner_iterations, group_budget, regularizer, allowed_contexts,
             dump, no_failure_arcs=0):
    # If true, runs the model with the last-char substitution closure.
    self.no_failure_arcs = no_failure_arcs

    # Create initial pattern set.
    VoCRF.__init__(self, Y, initial_contexts)

    self.dump = None
    if dump is not None:
        self.dump = Path(dump)
        mkdir(self.dump)

    self.corpus = corpus
    self.dev_best = -np.inf

    # The set of allowed contexts must be prefix-closed to make sense.
    self.allowed_contexts = None
    if allowed_contexts is not None:
        self.allowed_contexts = set(prefix_closure(allowed_contexts))

    self.train = train
    self.dev = dev

    # Max number of higher-order features =
    #   budget             [green nodes - the max number of 'active' contexts at any time]
    #   x extensions = |Y| [yellow nodes - a little room to grow]
    #   x number of labels [because that's how we encode features]
    # XXX: I think this is an overestimate; we want |states| x |labels|.
    self.H = max(group_budget * len(Y), len(self.C)) * self.A
    self.D = MAGIC * self.A

    self.group_budget = group_budget
    self.regularizer = regularizer / len(self.train)

    L = 2 if regularizer > 0 else -1
    self.sparse = LazyRegularizedAdagrad(self.D, L=L, C=self.regularizer)
    self.dense = OnlineProx(self.group_structure(), self.H, L=L, C=self.regularizer)

    self.inner_iterations = inner_iterations
    self.outer_iterations = outer_iterations

    self.log = []
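# Back-of-the-envelope check of the feature-budget arithmetic above, with
# hypothetical numbers (group_budget, |Y|, |C| are made up for illustration,
# and A is assumed to be the label-alphabet size, i.e. |Y|):
#   |Y| = 5 labels, group_budget = 20 groups, |C| = 25 initial contexts
#   H = max(20 * 5, 25) * 5 = 500 higher-order feature slots.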
def main():
    p = ArgumentParser()
    p.add_argument('--initial-order', type=int, default=1)
    p.add_argument('--max-order', type=int)
    p.add_argument('--inner-iterations', type=int, required=True)
    p.add_argument('--outer-iterations', type=int, required=True)
    p.add_argument('--C', type=float, required=True)
    p.add_argument('--budget', type=int, required=True)
    p.add_argument('--quick', action='store_true')
    args = p.parse_args()

    corpus = CoraCitations('data/cora.txt')
    if args.quick:
        corpus.train = corpus.train[:100]
        corpus.dev = []

    allowed_contexts = None
    if args.max_order is not None:
        allowed_contexts = prefix_closure(
            fixed_order_contexts(corpus.Y, order=args.max_order))
        print('allowed_contexts:', len(allowed_contexts))

    A = ActiveSet(corpus,
                  Y=corpus.Y,
                  train=corpus.train,
                  dev=corpus.dev,
                  group_budget=args.budget,
                  regularizer=args.C,
                  outer_iterations=args.outer_iterations,
                  inner_iterations=args.inner_iterations,
                  initial_contexts=fixed_order_contexts(corpus.Y, args.initial_order),
                  allowed_contexts=allowed_contexts)
    A.active_set()
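# Example invocation (illustrative; the script name is a placeholder, the flag
# values are made up):
#   python train_cora.py --initial-order 1 --max-order 3 \
#       --inner-iterations 5 --outer-iterations 10 --C 0.1 --budget 40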
def _main(args):
    with timeit('load data'):
        corpus = CoNLL_U('data/UD/{lang}/UD_{lang}'.format(lang=args.lang),
                         tag_type=args.tag_type)

    if args.quick:
        corpus.train = corpus.train[:100]
        corpus.dev = corpus.train[:0]

    allowed_contexts = None
    if args.context_count is not None:
        print('context count filter threshold %s' % args.context_count)
        max_order = args.initial_order + args.outer_iterations
        if args.max_order is not None:
            max_order = args.max_order
        allowed_contexts = contexts_by_count(corpus, max_order, args.context_count)
        print('allowed_contexts:', len(allowed_contexts))

        B = groupby2(allowed_contexts, len)
        print('(sizes %s)' % ', '.join('%s: %s' % (z, len(B[z])) for z in sorted(B)))

        if 0:
            # Things that survived the threshold.
            for k, v in B.items():
                if k >= 10:   # context size >= 10
                    print()
                    print(k)
                    for vv in v:
                        print('-'.join(vv))
            pl.plot(list(B.keys()), [len(v) for v in B.values()])
            pl.show()

        if 0:
            max_order = args.outer_iterations
            C = {}
            for n in range(1, max_order + 1):   # initial order + num iters
                C.update(corpus.tag_ngram_counts(n=n))
            pl.scatter([len(k) for k in C.keys()], list(C.values()), lw=0, alpha=0.5)
            pl.show()

    elif args.max_order is not None:
        allowed_contexts = prefix_closure(
            fixed_order_contexts(corpus.Y, order=args.max_order))
        print('allowed_contexts:', len(allowed_contexts))

    A = ActiveSet(corpus,
                  Y=corpus.Y,
                  train=corpus.make_instances('train', Instance),
                  dev=corpus.make_instances('dev', Instance),
                  group_budget=args.budget,
                  regularizer=args.C,
                  outer_iterations=args.outer_iterations,
                  inner_iterations=args.inner_iterations,
                  initial_contexts=fixed_order_contexts(corpus.Y, args.initial_order),
                  allowed_contexts=allowed_contexts,
                  no_failure_arcs=args.baseline,
                  dump=args.dump)
    A.active_set()
def ideal_runtime(self, w):
    "Evaluate the ideal runtime penalty."
    return len(prefix_closure({c for c in self.C
                               if w[self.context_feature_id(c)] != 0}))
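# Worked example (hypothetical weights): if only the contexts (1,) and
# (1, 2, 0) carry nonzero feature weights in w, the penalty counts their
# prefix closure, which contains (1,), (1, 2), and (1, 2, 0) - i.e. 3 -
# because every active context also forces its prefixes into the machine.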