def main():
    from arsenal.iterview import iterview
    from time import sleep
    from numpy.random import uniform

    T = Benchmark('A vs B')
    for _ in iterview(range(1000), T.title):
        with T['A']:
            sleep(np.random.exponential(.001))
        with T['B']:
            sleep(np.random.exponential(.001))
    T.compare()

    t = Timer('test')
    for i in iterview(range(1, 20)):
        for _ in range(10):
            with t(i=i):
                c = 0.01
                z = max(i**2 * 0.0001 + uniform(-c, c), 0.0)
                sleep(z)
    t.plot_feature('i')
    pl.show()
def contour_plot(f, xdomain, ydomain, color='viridis', alpha=0.5, levels=None, ax=None):
    "Contour plot of a function of two variables."
    from arsenal import iterview
    if ax is None:
        ax = pl.gca()
    [xmin, xmax, _] = xdomain
    [ymin, ymax, _] = ydomain
    X, Y = np.meshgrid(np.linspace(*xdomain), np.linspace(*ydomain))
    Z = np.array([
        f(np.array([x, y]))
        for (x, y) in iterview(zip(X.flat, Y.flat), length=len(X.flat))
    ]).reshape(X.shape)
    contours = ax.contour(X, Y, Z, 20, colors='black', levels=levels)
    ax.clabel(contours, inline=True, fontsize=8)
    if color is not None:
        ax.imshow(Z, extent=[xmin, xmax, ymin, ymax], origin='lower',
                  cmap=color, alpha=alpha)
    ax.axis(aspect='scalar')
    ax.figure.tight_layout()
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)
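# A minimal usage sketch (my own example, not from the library) for the `contour_plot`
# defined above: plot a quadratic bowl.  Each domain is a (min, max, npts) triple that
# gets unpacked into np.linspace; `pl` is assumed to be matplotlib.pyplot.
def demo_contour_plot():
    f = lambda z: z[0]**2 + z[1]**2
    contour_plot(f, xdomain=(-2, 2, 50), ydomain=(-2, 2, 50))
    pl.show()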
def build(GRAMMAR, ACC, RUN, INIT):
    data = []
    if INIT == 'policy':
        csvs = path('results').glob('*/dump/new_policy*.npz.inspect_rollouts.csv.gz')
    elif INIT == 'init':
        csvs = path('results').glob('*/dump/init.npz.inspect_rollouts.csv.gz')
    else:
        raise ValueError("don't understand INIT=%s" % INIT)
    for f in iterview(csvs):
        args = cPickle.load(file((f / '..' / 'args.pkl').abspath()))
        if args.grammar != GRAMMAR or args.accuracy != ACC or args.runtime != RUN:
            continue
        try:
            breakdown(data, args, pd.read_csv(f))
        except pd.io.common.EmptyDataError:
            print colors.red % '*** skipping empty file %s' % f
            print
    df = pd.DataFrame(data)
    df = df.sort_values('tradeoff')
    print df
    return df
def inner_optimization(self, iterations, prox_every=25):
    budget = self.group_budget
    for t in range(iterations):
        print()
        np.random.shuffle(self.train)
        for x in iterview(self.train, colors.green % 'Pass %s' % (t+1)):
            S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
            self.gradient(x.N, x.tags, S)
            S.backprop()
            if budget is not None and self.sparse.step % prox_every == 0:
                self.dense.prox_budget(budget)
            self.sparse.step += 1
            assert np.isfinite(self.sparse.w).all()
            assert np.isfinite(self.dense.w).all()
        # Make sure to call the prox update before finishing this pass. This will
        # keep the number of features within the budget.
        if budget is not None:
            self.dense.prox_budget(budget)
        self.after_inner_pass()
def make_instances(self, fold, cls):
    "Convert tuples in data `fold` to instances of `cls`."
    data = []
    for x in iterview(getattr(self, fold), msg='Features (%s)' % fold):
        tags, tokens = list(zip(*x))
        data.append(cls(tokens, self.Y.map(tags), self))
    return data
def fdcheck(func, w, g, keys=None, eps=1e-5):
    """
    Finite-difference check. Returns an `arsenal.math.compare` instance.

    - `func`: zero-argument function, which references `w` in the caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against.
    - `keys`: dimensions to check.
    - `eps`: perturbation size.
    """
    if keys is None:
        if hasattr(w, 'keys'):
            keys = w.keys()
        else:
            keys = range(len(w))
    fd = {}
    for key in iterview(keys):
        was = w[key]
        w[key] = was + eps
        b = func()
        w[key] = was - eps
        a = func()
        w[key] = was
        fd[key] = (b - a) / (2 * eps)
    return compare([fd[k] for k in keys], [g[k] for k in keys])
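# A minimal usage sketch for `fdcheck` above (my own example).  The zero-argument
# closure reads `w`, so the in-place perturbations made by fdcheck are visible to it.
def demo_fdcheck():
    import numpy as np
    w = np.random.randn(5)
    func = lambda: 0.5 * w.dot(w)   # f(w) = 0.5 w'w, whose exact gradient is w
    g = w.copy()                    # analytic gradient to verify
    return fdcheck(func, w, g)      # comparison of finite differences against g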
def test_parse():
    from arsenal.cache import memoize
    from hypergraphs.semirings import LazySort

    w = memoize(lambda *edge: np.exp(np.random.randn()))

    sentence = 'Papa ate the caviar with the spoon .'.split()
    grammar = load_grammar("""
    S    X .
    X    X X
    X    Papa
    X    ate
    X    the
    X    caviar
    X    with
    X    spoon
    X    in
    X    park
    """)

    def distribution():
        root = parser(sentence, grammar, w, LazySort)
        Z = 0.0
        p = {}
        for x in root:
            p[str(extract2(x.data))] = x.score
            Z += x.score
        return normalize(p)

    p = distribution()

    # TODO: make the parser return the root rather than the full chart.
    q = {x: 0 for x in p}
    reps = 10_000

    def run():
        return parser(sentence, grammar, w, Sample)

    if EAGER:
        def sampler():
            while True:
                yield run().value
    else:
        def sampler():
            yield from run()

    sample = iter(sampler())
    #sample = iter(lazy_sampler())

    for r in iterview(range(1, 1+reps)):
        _, s = next(sample)
        q[str(s)] += 1
        if r % 1_000 == 0:
            print(f'\nerr({r})=', 0.5*sum(abs(p[x] - q[x]/r) for x in p))
def __call__(self, policy, examples, grammar, msg='eval'):
    "Evaluate test-time pruning policy ``policy`` on ``examples``."
    rs = []
    for e in iterview(examples, msg=msg):
        rs.append(self.parse(e, grammar, policy))
    a = AvgReward(rs)
    if hasattr(a, self.ACCURACY):
        a.accuracy = getattr(a, self.ACCURACY)
    else:
        a.accuracy = a.attrs[self.ACCURACY]
    a.attrs['accuracy'] = a.accuracy
    return a
def contour_plot(f, xdomain, ydomain, color='viridis', alpha=0.5, levels=None):
    "Contour plot of a function of two variables."
    from arsenal import iterview
    [xmin, xmax, _] = xdomain
    [ymin, ymax, _] = ydomain
    X, Y = np.meshgrid(np.linspace(*xdomain), np.linspace(*ydomain))
    Z = np.array([
        f(np.array([x, y]))
        for (x, y) in iterview(zip(X.flat, Y.flat), length=len(X.flat))
    ]).reshape(X.shape)
    contours = pl.contour(X, Y, Z, 20, colors='black', levels=levels)
    pl.clabel(contours, inline=True, fontsize=8)
    if color is not None:
        pl.imshow(Z, extent=[xmin, xmax, ymin, ymax], origin='lower',
                  cmap=color, alpha=alpha)
    pl.axis(aspect='scalar')
    pl.gcf().tight_layout()
    pl.xlim(xmin, xmax)
    pl.ylim(ymin, ymax)
def run(self, methods, reps):
    from arsenal import iterview, restore_random_state
    if isinstance(methods, (tuple, list)):
        methods = {m.__name__: m for m in methods}
    jobs = [
        (name, seed)
        for seed in range(reps)   # TODO: use a better strategy for picking random seeds.
        for name in methods
    ]
    np.random.shuffle(jobs)   # shuffle jobs to avoid weird ordering correlations
    for name, seed in iterview(jobs):
        with restore_random_state(seed):
            with self[name]:
                methods[name]()
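# Hypothetical usage sketch for the `run` method above.  It assumes `run` lives on a
# benchmark/timer object `T` that supports `self[name]` timing contexts and `.compare()`,
# as the other snippets in this file do (e.g., Benchmark / arsenal.timer.timers()).
def demo_run(T, reps=20):
    import numpy as np
    def numpy_sort():
        np.sort(np.random.rand(10000))
    def builtin_sort():
        sorted(np.random.rand(10000).tolist())
    T.run([numpy_sort, builtin_sort], reps)   # method names are taken from __name__
    T.compare()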
def main():
    from arsenal.iterview import iterview
    from time import sleep
    from numpy.random import uniform
    t = Timer('test')
    for i in iterview(range(1, 20)):
        for _ in range(10):
            with t(i=i):
                c = 0.01
                z = max(i**2 * 0.0001 + uniform(-c, c), 0.0)
                sleep(z)
    a = t.plot_feature('i')
    print(a)
    pl.show()
def test():
    import numpy as np
    import matplotlib.pyplot as pl
    from arsenal.timer import timers
    from arsenal import iterview

    T = timers()

    params = [(i, rep) for i in range(5, 20) for rep in range(5)]
    np.random.shuffle(params)

    for (i, _) in iterview(params):
        # B-Y's runtime scales logarithmically with the bigger set and linearly
        # with the smaller set. Hash-based intersection scales with their sum.
        n = 2**i
        m = 2*10
        U = range(max(n, m)*5)
        A = list(np.random.choice(U, n, replace=0))
        B = list(np.random.choice(U, m, replace=0))
        A.sort()
        B.sort()
        sA = set(A)
        sB = set(B)
        with T['set'](i=2**i):
            E = (sA & sB)
        with T['make-set'](i=2**i):
            E = (set(A) & set(B))
        with T['B-Y'](i=2**i):
            C = list(sorted_intersection(A, B))
        assert sorted(E) == C

    T.compare()
    T.plot_feature('i')
    pl.show()
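# For context: `sorted_intersection` above is assumed to be imported from elsewhere.
# A hypothetical stand-in with the complexity described in the comment (binary search
# into the larger sorted list for each element of the smaller one, i.e. O(m log n))
# could look like this sketch.
def sorted_intersection_sketch(A, B):
    from bisect import bisect_left
    if len(A) > len(B):        # scan the smaller list, search the larger one
        A, B = B, A
    out = []
    for x in A:
        j = bisect_left(B, x)
        if j < len(B) and B[j] == x:
            out.append(x)
    return out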
def evaluate(self, predict, data, name, verbosity=1):
    if not data:
        return
    if verbosity:
        print()
        print('Phrase-based F1:', name)
    f1 = F1()
    for i, x in enumerate(iterview(data, msg='Eval %s' % name)):
        pred = extract_contiguous(predict(x))
        gold = extract_contiguous(self.Y.lookup_many(x.tags))
        # (i, begin, end) uniquely identifies the span
        for (label, begins, ends) in gold:
            f1.add_relevant(label, (i, begins, ends))
        for (label, begins, ends) in pred:
            f1.add_retrieved(label, (i, begins, ends))
    if verbosity:
        print()
    return f1.scores(verbose=verbosity >= 1)
def evaluate(self, predict, data, msg, verbosity=2):
    "Run prediction function `predict` on `data`."
    if not data:
        return float('nan'), []
    ff = F1()
    correct = Counter()
    total = Counter()
    for ii, x in enumerate(iterview(data, colors.blue % 'Eval (%s)' % msg)):
        y = predict(x)
        gold = self.Y.lookup_many(x.tags)
        for t, (got, want) in enumerate(zip(y, gold)):
            if verbosity >= 2:
                ff.report(instance=(ii, t), prediction=got, target=want)
            for c in self.error_classifications(x, t):
                if got == want:
                    correct[c] += 1
                total[c] += 1
    #print 'sentences:', len(data), 'tokens:', total['overall']
    c = 'overall'
    acc = '%s: %.2f' % (colors.light_yellow % c, 100 * correct[c] / total[c])
    other = total.keys()
    other.remove(c)
    breakdown = ', '.join('%s: %.2f' % (c, 100 * correct[c] / total[c])
                          for c in sorted(other))
    print '%s (%s)' % (acc, breakdown)
    if verbosity >= 2:
        print
        print 'F1 breakdown'
        print '============'
        ff.scores()
    return correct['overall'] / total['overall']
def __init__(self, args, setup, policy, output_file, tradeoff, roll_out):
    self.setup = setup
    self.tradeoff = tradeoff
    self.grammar = grammar = setup.grammar
    self.train = setup.train
    self.ACCURACY = args.accuracy
    self.RUNTIME = args.runtime
    self.policy_name = policy
    self.output_file = output_file
    self.nfeatures = nfeatures = setup.nfeatures

    self.policy = GLM(nfeatures, C=np.nan, loss=0)   # dummy
    self.policy._coef = np.load(policy)['coef']

    if roll_out == ROLLOUT.CP:
        Rollouts = CP
    elif roll_out == ROLLOUT.BF:
        Rollouts = BF
    elif roll_out == ROLLOUT.HY:
        Rollouts = HY
    elif roll_out == ROLLOUT.DP:
        Rollouts = DP
    else:
        raise ValueError('Unrecognized rollout option %s' % roll_out)

    tmp = workaround(output_file, context=self)
    for e in iterview(self.train, msg='rollouts'):
        p = Rollouts(grammar, e, self.policy,
                     accuracy=args.accuracy, runtime=args.runtime,
                     tradeoff=tradeoff)
        p.roll_outs(tmp)
    tmp.save()
def __init__(self, args, setup, tradeoff, iterations, minibatch, results=None, C=None,
             roll_out=ROLLOUT.CP, initializer=INIT.BODENSTAB_GOLD,
             initializer_penalty=0.01, show_reference=0):
    self.evals = []
    self.tradeoff = tradeoff
    self.grammar = grammar = setup.grammar
    self.nfeatures = nfeatures = setup.nfeatures
    self.ACCURACY = args.accuracy
    self.RUNTIME = args.runtime
    self.C = C

    if args.classifier == CLASSIFIER.LOGISTIC:
        self.policy = GLM(nfeatures, C=self.C, loss=0)
    elif args.classifier == CLASSIFIER.LINEAR:
        self.policy = GLM(nfeatures, C=self.C, loss=1)
    elif args.classifier == CLASSIFIER.ADAGRAD:
        self.policy = Adagrad(self.nfeatures, C=self.C, loss=0, eta=args.learning_rate)
    elif args.classifier == CLASSIFIER.ADAGRAD_LINEAR:
        self.policy = Adagrad(self.nfeatures, C=self.C, loss=1, eta=args.learning_rate)
    elif args.classifier == CLASSIFIER.ADAGRAD_HINGE:
        self.policy = Adagrad(self.nfeatures, C=self.C, loss=2, eta=args.learning_rate)
    elif args.classifier == CLASSIFIER.SVM:
        self.policy = SVM(self.nfeatures, C=self.C)
    elif args.classifier == CLASSIFIER.PERCEPTRON:
        self.policy = Perceptron(self.nfeatures)
    else:
        raise AssertionError('Unrecognized classifier option %r' % args.classifier)

    if args.init_weights is not None:
        # XXX: Hack to warm-start weights.
        print '[init weights]', args.init_weights
        assert args.init_weights.exists()
        self.policy._coef = np.load(args.init_weights)['coef']

    self.evaluate = Evaluate(args.accuracy, args.runtime)

    sty = {
        'oracle1':    dict(c='k', alpha=0.5, linestyle=':'),
        'fastmle':    dict(c='g', alpha=0.5, linestyle=':'),
        'unpruned':   dict(c='k', alpha=0.5, linestyle='--'),
        'new_policy': dict(c='b', lw=2),
    }
    self.lc = ddict(lambda name: viz.LearningCurve(name, sty=sty))

    train = list(setup.train)
    random.shuffle(train)
    dev = list(setup.dev)

    dataset = [('train', train)]
    if dev:
        dataset.append(('dev', dev))

    self.train = train
    self.dev = dev
    self.results = results

    # Do we need to run the unpruned parser? (This can be very slow, so we
    # should only do it when necessary.)
    if (show_reference
            or roll_out == ROLLOUT.BODENSTAB_MLE
            or initializer == INIT.BODENSTAB_MLE):
        from ldp.parsing.util import item_tree, item_tree_get_items
        for e in iterview(train + dev, msg='unpruned'):
            # unpruned
            r = self.evaluate.parse(e, grammar, mask=e.mask, with_derivations=1)
            e.mle_spans = frozenset({
                (I, K) for (_, I, K) in item_tree_get_items(item_tree(r.coarse))
                if K - I > 1 and K - I != e.N
            })
            del r.coarse, r.derivation   # delete to save memory
            e.baseline = r

    # Do we need to run the oracle parser?
    if show_reference:
        for e in iterview(train + dev, msg='oracle'):
            # oracle
            m = e.mask
            for x in e.nodes:
                m[x] = (x in e.gold_spans)
            r = self.evaluate.parse(e, grammar, mask=m, with_derivations=1)
            del r.coarse, r.derivation   # delete to save memory
            e.oracle = r

    # plot/log baselines
    self.baselines()

    # ----------------------------------
    # Baseline -- assumes no dynamic features.
    if roll_out in {ROLLOUT.BODENSTAB_GOLD, ROLLOUT.BODENSTAB_MLE}:
        for e in iterview(train, msg='rollouts'):
            p = BodenstabParser(grammar, e, target=roll_out, tradeoff=tradeoff)
            p.roll_outs()
        print colors.yellow % 'Training...'
        with timeit('train'):
            self.policy.train(train)
        ps = dict(new_policy=self.policy)
        x = dict(iteration=1, tradeoff=tradeoff)
        self.iteration = 1
        self.performance(dataset, ps, x)
        return

    # ----------------------------------
    if roll_out == ROLLOUT.CP:
        Rollouts = CP
    elif roll_out == ROLLOUT.BF:
        Rollouts = BF
    elif roll_out == ROLLOUT.HY:
        Rollouts = HY
    elif roll_out == ROLLOUT.DP:
        Rollouts = DP
    else:
        raise ValueError('Unrecognized rollout option %s' % roll_out)

    for iteration in xrange(1, iterations + 1):
        self.iteration = iteration
        print
        print colors.green % 'Iter %s' % iteration

        if iteration == 1 and initializer in [ROLLOUT.BODENSTAB_MLE, ROLLOUT.BODENSTAB_GOLD]:
            # First iteration uses asymmetric classification to initialize.
            for e in iterview(train, msg='rollouts'):
                p = BodenstabParser(grammar, e, target=initializer,
                                    tradeoff=initializer_penalty)
                p.roll_outs()
        else:
            M = choice(train, min(len(train), minibatch), replace=0)
            tmp = []
            for e in iterview(M, msg='rollouts'):
                p = Rollouts(grammar, e, self.policy,
                             accuracy=args.accuracy, runtime=args.runtime,
                             tradeoff=tradeoff)
                p.roll_outs(tmp)

            # corpus-level accuracy
            if args.accuracy in (ACC.EVALB_corpus, ACC.EXPECTED_RECALL_corpus):
                self.postprocess_rollouts_corpus(M, tmp)
            else:
                # Compare baseline labels to LOLS's "labels" via rollouts.
                self.asym_v_lols(tmp)

                # propagate back to CSC datasets
                for [e, (I, K), w, (action0, r0), (action1, r1)] in tmp:
                    if args.accuracy == ACC.EVALB_avg:
                        acc1 = r1.f1()
                        acc0 = r0.f1()
                    elif args.accuracy == ACC.EXPECTED_RECALL_avg:
                        acc1 = r1.recall()
                        acc0 = r0.recall()
                    else:
                        acc1 = r1.accuracy
                        acc0 = r0.accuracy
                    e.Q[I, K, action1] += w * (acc1 - tradeoff * r1.runtime)
                    e.Q[I, K, action0] += w * (acc0 - tradeoff * r0.runtime)

        print colors.yellow % 'Training...'
        with timeit('train'):
            self.policy.train(train)

        # Specify additional policies to evaluate on training data.
        ps = dict(new_policy=self.policy)

        # metadata to log
        x = dict(iteration=iteration, tradeoff=tradeoff)

        self.performance(dataset, ps, x)
def get_data(G, policy_name, w, examples, verbose=0):
    data = []
    for eid, e in enumerate(iterview(examples, msg='evalb')):
        if verbose:
            print
            print
            print colors.yellow % e.sentence

        F = Features(G, nfeatures=2**22)
        words = e.sentence.split()

        if 'unpruned' in policy_name:
            # <TIMING BLOCK>
            mask = e.mask
            b1 = time()
            e.tokens = np.asarray(G.encode_sentence(words))
            b2 = time()
            state = pruned_parser(e.tokens, G, mask)
            b3 = time()
            # </TIMING BLOCK>
            coarse = G.coarse_derivation(state.derivation)
        else:
            # <TIMING BLOCK>
            b1 = time()
            e.tokens = np.asarray(G.encode_sentence(words))
            mask = F.mask(e, w)
            b2 = time()
            state = pruned_parser(e.tokens, G, mask)
            b3 = time()
            # </TIMING BLOCK>
            coarse = G.coarse_derivation(state.derivation)

        nodes = e.nodes
        mask_size = sum(mask[x] for x in nodes)
        keep_rate = mask_size / len(nodes) if len(nodes) > 0 else 0

        want_and_got, got, want = e.evalb_unofficial(coarse)
        evalb_avg, _, recall_avg = fpr(want_and_got, got, want)

        if isinstance(coarse, Tree):
            parse = oneline(coarse)
            fail = 0
        else:
            parse = '(FAIL %s)' % ' '.join('(X %s)' % x for x in e.sentence.split())
            fail = 1

        data.append({
            'example': e,
            'eid': eid,
            'N': e.N,
            'fail': fail,
            'mask': mask_size,
            'keep_rate': keep_rate,
            'parse': parse,
            'policy': policy_name,
            'time_total': b3 - b1,
            'time_feature': b2 - b1,
            'time_parse': b3 - b2,
            'evalb_avg': evalb_avg,
            'recall_avg': recall_avg,
            'want_and_got': want_and_got,
            'want': want,
            'got': got,
            'pushes': state.pushes,
            'pops': state.pops,
        })
    return data
def aggregate_multiple_runtime_trials(Ds, Ps):
    """Collapse multiple dataframes `Ds` from different timing runs into a single
    one by taking the min over runtimes (i.e., the new runtime will be
    "best-of-k", where k=|Ds|).

    Actually, this function does more than that. It also collapses over
    sentences, e.g., computing corpus-EVALB and avg[best-of-k runtimes].

    """
    D0 = Ds[0]

    # Append trials together.
    foo = Ds[0]
    for dd in Ds[1:]:
        foo = foo.append(dd)

    # Take min over time_total for each policy-example pair.
    minz = foo[['policy', 'example', 'time_total']].groupby(['policy', 'example']).min()

    data = []
    for policy in iterview(Ps):
        dump = path(policy).dirname()
        args = cPickle.load(file(dump / 'args.pkl'))
        log = pd.read_csv(dump / 'log.csv')

        # TODO: will need to add extra cases.
        if 'DP' in args.roll_out:
            type_ = 'DP'
        elif 'CP' in args.roll_out:
            type_ = 'CP'
        elif 'HY' in args.roll_out:
            type_ = 'HY'
        elif 'BODEN' in args.roll_out:
            type_ = 'baseline'
        else:
            raise ValueError(args.roll_out)

        min_times = minz.ix[policy]['time_total']

        P = D0[D0.policy == policy]
        f = cgw_f(P.want_and_got.sum(), P.got.sum(), P.want.sum())

        #pl.scatter(df.avg_bestof_time, df.evalb, c=C[name], lw=0)
        #show_frontier(df.avg_bestof_time, df.evalb, c=C[name], interpolation='linear', label=name)
        #[w,b] = np.polyfit(df.pushes, df.avg_bestof_time, deg=1)
        #show_frontier(df.pushes*w + b, df.evalb, interpolation='linear', c=C[name])

        if 0:
            # log-log plot of pushes v. seconds. Really great correlation!
            PP = P[['example', 'pushes']].join(min_times, on='example')
            PP['log(pushes)'] = np.log(PP.pushes)
            PP['log(seconds)'] = np.log(PP.time_total)
            compare('log(pushes)', 'log(seconds)', data=PP, scatter=1, show_regression=1)

            #pl.figure()
            # pushes v. seconds. Really great correlation!
            #PP = P[['example','pushes']].join(min_times, on='example')
            #compare('pushes', 'time_total', data=PP, scatter=1, show_regression=1)

            pl.ioff(); pl.show()

        if 0:
            # empirical runtime estimates
            # scatter plot sentence length against runtime.
            n_by_time = P[['example', 'N']].join(min_times, on='example')
            pl.scatter(n_by_time.N, n_by_time.time_total, alpha=0.5, lw=0)

            # highlight median runtime per sentence length.
            n_by_median_time = n_by_time.groupby('N').median()
            pl.plot(n_by_median_time.index, n_by_median_time.time_total, c='k', lw=2)

            # empirical exponent and constant factor
            compare(np.log(n_by_time.time_total), np.log(n_by_time.N),
                    scatter=1, show_regression=1)

            pl.ioff(); pl.show()

        # Use early stopping on dev to pick the policy.
        dev = log.ix[log['dev_new_policy_reward'].argmax()]

        row = {'avg_bestof_time': np.mean(min_times),
               'wps': np.mean(P.N) / np.mean(min_times),
               'pushes': np.mean(P.pushes),
               'pops': np.mean(P.pops),
               'policy': policy,
               'dev_pushes': dev.dev_new_policy_pushes,
               'dev_evalb': dev.dev_new_policy_evalb_corpus,
               'type': type_,
               'evalb': f}
        row.update({'args_' + k: v for k, v in args.__dict__.items()})
        data.append(row)

    # Remove unused baselines (sorry, this is a bit ugly).
    ddd = pd.DataFrame(data)
    others = ddd[ddd.type != 'baseline']
    B = ddd[ddd.type == 'baseline']
    used = set()
    for _, z in others.iterrows():
        [ix] = B[B.policy == z.args_init_weights].index
        used.add(ix)
    B = B.ix[list(used)]
    ddd = others.append(B)
    return ddd
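# Illustration (toy data, not from the experiments) of the "best-of-k" collapse done
# above: concatenate the per-trial frames, then take the min runtime for each
# (policy, example) pair.
def demo_best_of_k():
    import pandas as pd
    trial1 = pd.DataFrame({'policy': ['p', 'p'], 'example': [0, 1], 'time_total': [0.12, 0.30]})
    trial2 = pd.DataFrame({'policy': ['p', 'p'], 'example': [0, 1], 'time_total': [0.10, 0.35]})
    both = pd.concat([trial1, trial2])
    best = both.groupby(['policy', 'example'])['time_total'].min()
    return best   # best-of-2 runtimes: (p, 0) -> 0.10, (p, 1) -> 0.30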
def main():
    pl.ion()
    p = ArgumentParser()
    p.add_argument('root', type=path)
    p.add_argument('--quick', action='store_true',
                   help="Load a single evaluation log (for quick tests). Won't run best-of-k runtime.")
    p.add_argument('-i', action='store_true',
                   help='Interactive mode => open an IPython shell after execution.')
    args = p.parse_args()

    runs = [r for r in sorted(args.root.glob('*')) if r.isdir()]

    if args.quick:
        print colors.bold % colors.red % 'Warning! Only using some of the runs for timing information.'
        runs = runs[:1]

    Ds = [(r, load(r)) for r in iterview(runs)]

    D0, Ds, bestof = sanity_check(Ds)

    # if 0:
    #     pl.figure()
    #     for name, df in D0.groupby('type'):
    #         pl.scatter(df.avg_bestof_time, df.evalb, c=C[name], lw=0)
    #         show_frontier(df.avg_bestof_time, df.evalb, c=C[name], interpolation='linear-convex', label=name)
    #         #[w,b] = np.polyfit(df.pushes, df.avg_bestof_time, deg=1)
    #         #show_frontier(df.pushes*w + b, df.evalb, interpolation='linear', c=C[name])
    #     pl.xlabel('sec/sentence (best of %s)' % len(Ds))
    #     pl.ylabel('Corpus EVALB-F1')
    #     pl.legend(loc=4)
    #     pl.show()

    rescale = 1/bestof.pushes.max()
    bestof['pushes_r'] = bestof.pushes*rescale

    B = bestof[bestof.type == 'baseline'].copy()
    lols = bestof[bestof.type != 'baseline']
    RO_types = lols.args_roll_out.unique()

    ax = pl.figure().add_subplot(111)
    for name, df in reversed(sorted(bestof.groupby('type'))):
        pl.scatter(df.pushes_r, df.evalb, c=C[name], lw=0, zorder=10, label='', s=50)
        pts = show_frontier(df.pushes_r, df.evalb, interpolation='linear-convex',
                            lw=2, c=C[name], label=name)
        ax.plot(pts[:, 0], pts[:, 1], label=name, c=C[name])

    pl.ylabel('Corpus $F_1$')
    pl.legend(loc=4)
    pl.tight_layout()

    ax = pl.gca()
    conesize = .06
    lambda_cone(np.array(B.evalb), np.array(B.pushes_r), ax=ax, c=c_baseline,
                conesize=conesize, lines=0)

    # --------------------------------------------------------------------------
    # Fit a parametric curve to dev points; show arrows on test points.
    from ldp.viz.parametric_fit import fit
    df = join_with_dev(B)
    ff, gg = fit(df.dev_pushes, df.dev_evalb)

    if 0:
        # enable to show the parametric curve.
        xs = pl.linspace(0, df.dev_pushes.max() + .1*df.dev_pushes.ptp(), 100)
        ax.plot(xs*rescale, ff(xs), c='k')

    ax = pl.gca()
    for _, z in df.iterrows():
        x, y = z.test_pushes*rescale, z.test_evalb
        arrow(x, y, gg(z.dev_pushes)/rescale, offset=-conesize, c=c_vec_baseline, ax=ax)

    # --------------------------------------------------------------------------
    B.loc[:, 'tradeoff'] = np.nan
    data = []

    # Loop over all rollout types joined on initial policy (i.e., the baseline).
    for i, bl in B.iterrows():
        spawn = lols[lols.args_initializer_penalty == bl.args_tradeoff]
        assert len(spawn) == len(RO_types)
        models = {}
        for ro in RO_types:
            [ix] = spawn[spawn.args_roll_out == ro].index
            models[ro] = lols.ix[ix]

        [dev_pushes] = df[df.policy == bl.policy].dev_pushes
        tradeoff = gg(dev_pushes)
        B.loc[i, 'tradeoff'] = tradeoff

        if 1:   # set to 0 to skip the hypothesis tests.
            print colors.bold % colors.green % '============================================================='
            print 'tradeoff: %g' % tradeoff
            print

            baseline_acc, baseline_run = get_acc_run(D0[D0.policy == bl.policy])

            row = {
                'baseline': bl.policy,
                'baseline_accuracy': baseline_acc,
                'baseline_runtime': baseline_run,
                'baseline_reward': baseline_acc - tradeoff*baseline_run,
                'wps_baseline': bl.wps,
                'wallclock_baseline': bl.avg_bestof_time,
                'tradeoff': tradeoff,
            }

            star_sty = dict(alpha=1, lw=0, marker='*', s=700, zorder=100)

            for ro, model in sorted(models.items()):
                print colors.bold % '# %s' % ro
                sig, win = paired_permutation_test(D0, a=bl.policy, b=model.policy,
                                                   tradeoff=tradeoff, R=5000)
                acc, run = get_acc_run(D0[D0.policy == model.policy])
                row[ro] = model.policy
                row['%s_accuracy' % ro] = acc
                row['%s_runtime' % ro] = run
                row['%s_reward' % ro] = acc - tradeoff*run
                row['wps_%s' % ro] = model.wps
                row['wallclock_%s' % ro] = model.avg_bestof_time
                row['winner_%s' % ro] = win
                row['sig_%s' % ro] = sig

                if win == +1:
                    pl.scatter([model.pushes_r], [model.evalb], c=C[ro], **star_sty)
                elif win == -1:
                    pl.scatter([bl.pushes_r], [bl.evalb], c=C['baseline'], **star_sty)

                # Draw a dotted line to the baseline point.
                pl.plot([bl.pushes_r, model.pushes_r], [bl.evalb, model.evalb],
                        c=C[ro], lw=1, alpha=0.75, label=None, linestyle='--')

            data.append(row)

    #[w,b] = np.polyfit(B.pushes, B.avg_bestof_time, deg=1)
    xx = lols.pushes_r
    xx = np.linspace(xx.min(), xx.max(), 12)

    # put ticks on the top of the plot.
    #ax.xaxis.tick_top()
    #pl.xticks(xx, ['%.2g\m(%.2g)' % (x/rescale / 1e6, (x/rescale*w+b)*100) for x in xx], rotation=0)
    #pl.text(0.4, 0.401, re.sub('(\d)e([\-+]\d+)', r'\1e^{\2}', r'$\textit{seconds} \approx %.2g \cdot \textit{pushes} + %.2g$' % (w,b)))
    #pl.xlabel('average megapushes ($\\approx$ milliseconds)')
    pl.xticks(xx, [r'$%.2g$' % (x/rescale / 1e6) for x in xx])
    # pl.xticks(xx, [r'%.2g' % (x/rescale / 1e6) for x in xx], rotation=45)
    if 'medium' not in args.root:
        pl.xlabel('millions of hyperedges built per sentence')

    pl.ylim(bestof.evalb.min() - 0.02, bestof.evalb.max() + 0.015)
    pl.xlim(bestof.pushes_r.min() - .01, bestof.pushes_r.max() + 0.01)

    zf = pd.DataFrame(data).sort_values('tradeoff')

    # print zf[['tradeoff', 'baseline_reward', 'cp_reward', 'dp_reward', 'winner_cp', 'winner_dp']].sort_values('tradeoff').to_string(float_format='%.4g'.__mod__, index=0)
    # print zf[['tradeoff',
    #           'baseline_accuracy', 'baseline_runtime',
    #           'cp_accuracy', 'cp_runtime',
    #           'dp_accuracy', 'dp_runtime',
    #           'winner_cp', 'winner_dp']].sort_values('tradeoff').to_string(float_format='%.4g'.__mod__, index=0)

    if not args.quick:
        sig_file = args.root / 'significance.csv'
        print
        print colors.green % 'wrote %s' % sig_file
        print
        zf.to_csv(sig_file)

    if args.i:
        pl.ion(); pl.show()
        from arsenal.debug import ip; ip()
    else:
        pl.ioff(); pl.show()