Example #1
def random_contexts(sigma, depth, size):
    """Generate random contexts, such that `|W|=size` (before closure) of
    |w|=`depth` for w in W, from a |alphabet|=sigma.

    """
    sigma = range(sigma)
    possible = list(xprod(*(sigma, ) * depth))
    np.random.shuffle(possible)
    return list(prefix_closure(possible[:size]))
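A minimal, self-contained sketch of how this helper can be exercised. The `xprod` and `prefix_closure` definitions below are stand-ins with assumed semantics (`xprod` as `itertools.product`, `prefix_closure` yielding every non-empty prefix of each context), not the library's own implementations:

import itertools
import numpy as np

xprod = itertools.product  # assumed alias for itertools.product

def prefix_closure(contexts):
    # Assumed semantics: every non-empty prefix of every context tuple.
    closed = set()
    for c in contexts:
        for k in range(1, len(c) + 1):
            closed.add(c[:k])
    return closed

# Draw 5 random length-3 contexts over a 4-symbol alphabet, then close them.
W = random_contexts(sigma=4, depth=3, size=5)
print(len(W))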
Example #2
    def update(self, sigma, C):
        if self.no_failure_arcs:
            C = set(prefix_closure(C))
            C.update((a,) for a in sigma)
            b4 = len(C)
            C = set(last_char_sub_closure(sigma, C))
            C.add(())
            print('[last-char closure] before: %s, after: %s' % (b4, len(C)))
        return VoCRF.update(self, sigma, C)
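For orientation, a rough stand-in for the closure step used above, assuming `last_char_sub_closure(sigma, C)` adds, for each non-empty context, every variant obtained by substituting its last symbol with a symbol from `sigma` (an assumption based on the name, not the library's implementation):

def last_char_sub_closure(sigma, C):
    # Assumed semantics: close C under substitution of the last symbol.
    out = set(C)
    for c in C:
        if c:
            out.update(c[:-1] + (a,) for a in sigma)
    return out

# e.g. sigma = (0, 1), C = {(0, 1)}  ->  the closure also contains (0, 0).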
Example #3
    def __init__(self,
                 corpus,
                 Y,
                 train,
                 dev,
                 initial_contexts,
                 outer_iterations,
                 inner_iterations,
                 group_budget,
                 regularizer,
                 allowed_contexts,
                 dump,
                 no_failure_arcs=0):

        self.no_failure_arcs = no_failure_arcs  # if true, runs model with last-char subst closure.

        # Create initial pattern set.
        VoCRF.__init__(self, Y, initial_contexts)

        self.dump = None
        if dump is not None:
            self.dump = Path(dump)
            mkdir(self.dump)

        self.corpus = corpus
        self.dev_best = -np.inf

        # the set of allowed contexts must be prefix closed to make sense.
        self.allowed_contexts = None
        if allowed_contexts is not None:
            self.allowed_contexts = set(prefix_closure(allowed_contexts))

        self.train = train
        self.dev = dev

        # max number of higher-order features =
        #              budget        [green nodes - the max number of 'active' contexts at any time]
        #  x       extensions = |Y|  [yellow nodes - a little room to grow]
        #  x number of labels        [because that's how we encode features]   XXX: I think this is an overestimate; we want |states| x |labels|
        self.H = max(group_budget * len(Y), len(self.C)) * self.A
        self.D = MAGIC * self.A

        self.group_budget = group_budget
        self.regularizer = regularizer / len(self.train)

        L = 2 if regularizer > 0 else -1
        self.sparse = LazyRegularizedAdagrad(self.D, L=L, C=self.regularizer)
        self.dense = OnlineProx(self.group_structure(),
                                self.H,
                                L=L,
                                C=self.regularizer)

        self.inner_iterations = inner_iterations
        self.outer_iterations = outer_iterations

        self.log = []
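A quick sanity check of the budget arithmetic described in the comment above, using hypothetical sizes (`A`, inherited from VoCRF, is treated here as an assumed placeholder value):

# Hypothetical numbers: |Y| = 10 labels, group_budget = 20, 150 initial
# contexts, and A = 10 (placeholder for the value inherited from VoCRF).
Y_size, group_budget, n_contexts, A = 10, 20, 150, 10
H = max(group_budget * Y_size, n_contexts) * A   # max(200, 150) * 10 = 2000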
Example #4
def main():
    p = ArgumentParser()
    p.add_argument('--initial-order', type=int, default=1)
    p.add_argument('--max-order', type=int)
    p.add_argument('--inner-iterations', type=int, required=True)
    p.add_argument('--outer-iterations', type=int, required=True)
    p.add_argument('--C', type=float, required=True)
    p.add_argument('--budget', type=int, required=True)
    p.add_argument('--quick', action='store_true')

    args = p.parse_args()

    corpus = CoraCitations('data/cora.txt')

    if args.quick:
        corpus.train = corpus.train[:100]
        corpus.dev = []

    allowed_contexts = None
    if args.max_order is not None:
        allowed_contexts = prefix_closure(
            fixed_order_contexts(corpus.Y, order=args.max_order))
        print('allowed_contexts:', len(allowed_contexts))

    A = ActiveSet(corpus,
                  Y=corpus.Y,
                  train=corpus.train,
                  dev=corpus.dev,
                  group_budget=args.budget,
                  regularizer=args.C,
                  outer_iterations=args.outer_iterations,
                  inner_iterations=args.inner_iterations,
                  initial_contexts=fixed_order_contexts(
                      corpus.Y, args.initial_order),
                  allowed_contexts=allowed_contexts)

    A.active_set()
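A typical invocation, assuming this entry point lives in a script such as run_cora.py (the file name is hypothetical and the flag values are only illustrative):

# python run_cora.py --initial-order 1 --max-order 3 \
#     --inner-iterations 5 --outer-iterations 10 --C 0.1 --budget 20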
Example #5
def _main(args):
    with timeit('load data'):
        corpus = CoNLL_U('data/UD/{lang}/UD_{lang}'.format(lang=args.lang),
                         tag_type=args.tag_type)

    if args.quick:
        corpus.train = corpus.train[:100]
        corpus.dev = corpus.train[:0]

    allowed_contexts = None
    if args.context_count is not None:
        print('context count filter threshold %s' % args.context_count)

        max_order = args.initial_order + args.outer_iterations
        if args.max_order is not None:
            max_order = args.max_order

        allowed_contexts = contexts_by_count(corpus, max_order,
                                             args.context_count)
        print('allowed_contexts:', len(allowed_contexts))

        B = groupby2(allowed_contexts, len)
        print('(sizes %s)' % (', '.join('%s: %s' % (z, len(B[z]))
                                        for z in sorted(B))))

        if 0:
            # Things that survived the threshold.
            for k, v in B.items():
                if k >= 10:  # context size >= 10
                    print()
                    print(k)
                    for vv in v:
                        print('-'.join(vv))
            pl.plot(list(B.keys()), [len(v) for v in B.values()])
            pl.show()

        if 0:
            max_order = args.outer_iterations
            C = {}
            for n in range(1, max_order + 1):  # initial order + num iters
                C.update(corpus.tag_ngram_counts(n=n))
            pl.scatter([len(k) for k in C.keys()], list(C.values()), lw=0, alpha=0.5)
            pl.show()

    elif args.max_order is not None:
        allowed_contexts = prefix_closure(
            fixed_order_contexts(corpus.Y, order=args.max_order))
        print('allowed_contexts:', len(allowed_contexts))

    A = ActiveSet(corpus,
                  Y=corpus.Y,
                  train=corpus.make_instances('train', Instance),
                  dev=corpus.make_instances('dev', Instance),
                  group_budget=args.budget,
                  regularizer=args.C,
                  outer_iterations=args.outer_iterations,
                  inner_iterations=args.inner_iterations,
                  initial_contexts=fixed_order_contexts(
                      corpus.Y, args.initial_order),
                  allowed_contexts=allowed_contexts,
                  no_failure_arcs=args.baseline,
                  dump=args.dump)

    A.active_set()
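The ArgumentParser for this entry point is not shown in the example; below is a minimal sketch of the flags that `_main` actually reads (which flags are required and their defaults are assumptions):

from argparse import ArgumentParser

def main():
    p = ArgumentParser()
    p.add_argument('--lang', required=True)
    p.add_argument('--tag-type')
    p.add_argument('--initial-order', type=int, default=1)
    p.add_argument('--max-order', type=int)
    p.add_argument('--context-count', type=int)
    p.add_argument('--inner-iterations', type=int, required=True)
    p.add_argument('--outer-iterations', type=int, required=True)
    p.add_argument('--C', type=float, required=True)
    p.add_argument('--budget', type=int, required=True)
    p.add_argument('--baseline', type=int, default=0)
    p.add_argument('--dump')
    p.add_argument('--quick', action='store_true')
    _main(p.parse_args())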
Example #6
    def ideal_runtime(self, w):
        "Evaluate the ideal runtime penalty."
        return len(prefix_closure(
            {c for c in self.C if w[self.context_feature_id(c)] != 0}))
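A toy illustration of the penalty with made-up numbers:

# If the contexts with non-zero feature weight are (0, 1) and (0, 1, 2),
# their prefix closure is {(0,), (0, 1), (0, 1, 2)}, so ideal_runtime
# returns 3.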