def test_mccfr_goofspiel3():
    """MCCFR on 3-card zero-sum Goofspiel converges close to equilibrium."""
    game = Goofspiel(3, scoring=Goofspiel.Scoring.ZEROSUM)
    mccfr = OutcomeMCCFR(game, seed=51)
    mccfr.compute(600, burn=0.5)
    strategies = mccfr.strategies
    uniform = UniformStrategy()
    # After the first revealed card is 2, player 0 should play the middle card.
    state = game.play_sequence([2])
    assert strategies[0].strategy(state) == pytest.approx([0., 0.9, 0.], abs=0.1)
    # Self-play payoff is near zero (zero-sum near-equilibrium) ...
    self_play = sample_payoff(game, strategies, 300, seed=12)[0]
    assert self_play == pytest.approx([0.0, 0.0], abs=0.1)
    # ... while the trained strategy beats a uniform opponent.
    vs_uniform = sample_payoff(game, (strategies[0], uniform), 300, seed=13)[0]
    assert vs_uniform == pytest.approx([1.2, -1.2], abs=0.2)
    # Both players' strategies should be hard to exploit.
    assert exploitability(game, 0, strategies[0]) < 0.1
    assert exploitability(game, 1, strategies[1]) < 0.1
def compute_mccfr_traces(g, prefix, n_traces, iters, steps, depth=6, burn=None,
                         burn_from=0, add_uniform=True, exploit_every=None,
                         eploit_max_nodes=1e6):
    """
    Compute independent strategy traces of MCCFR in game `g`.

    Parameters
    ----------
    g : game instance
    prefix : str
        Label for the outer progress bar.
    n_traces : int
        Number of independent MCCFR runs.
    iters : int
        Total MCCFR iterations per run.
    steps : int
        Number of trace snapshots recorded per run.
    depth : int
        Trace depth passed to `StrategyTrace`.
    burn : float or None
        If set, fraction of `steps` over which updates are down-weighted
        ("burn-in") for runs with index >= `burn_from`.
    burn_from : int
        First run index that uses burn-in.
    add_uniform : bool
        Append a uniform-strategy reference trace (sharing the time points
        of the first MCCFR trace).
    exploit_every : int or None
        If set, compute exploitability every `exploit_every` snapshots.
    eploit_max_nodes : number
        Node limit passed to `exploitability`. NOTE: the parameter name keeps
        its historical misspelling for backward compatibility with callers.

    Returns
    -------
    list of StrategyTrace
    """
    traces = []
    for ti in tqdm.trange(n_traces, desc=prefix):
        name = "MCCFR run #{}".format(ti)
        if burn and ti >= burn_from:
            name += " (burn-in)"
        # Seed is derived from the game's string form so runs differ per `ti`
        # but are tied to `g`. NOTE(review): str hashing depends on
        # PYTHONHASHSEED, so traces are not reproducible across interpreter
        # invocations — confirm this is intended.
        mc = OutcomeMCCFR(g, seed=hash(str(g)) % 2**30 + ti)
        ps = StrategyTrace(g, depth=depth, name=name)
        for i in tqdm.trange(steps, desc="MCCFR steps"):
            w = 1.0
            if burn and ti >= burn_from and i < steps * burn:
                # Ramp the update weight exponentially from 0.03 up to 1.0
                # over the burn-in portion of the run.
                w = 0.03**(1.0 - float(i) / steps / burn)
            # Advance to the i-th checkpoint's total iteration count.
            mc.compute(int(iters * (i + 1) / steps) - mc.iterations,
                       progress=False, weight=w)
            exps = None
            if exploit_every is not None and (steps - i - 1) % exploit_every == 0:
                exps = [
                    exploitability(g, p, mc.strategies[p],
                                   max_nodes=eploit_max_nodes)
                    for p in range(g.players)
                ]
            ps.append(mc.iterations, mc.strategies, exps)
        traces.append(ps)
    # Fix: guard against n_traces == 0 — the uniform reference trace copies
    # its time points from traces[0], which would raise IndexError on an
    # empty list in the original code.
    if add_uniform and traces:
        rps = StrategyTrace(g, depth=depth, name="Uniform")
        rstrat = [UniformStrategy()] * g.players
        rexps = None
        if exploit_every is not None:
            rexps = [
                exploitability(g, p, rstrat[p], max_nodes=eploit_max_nodes)
                for p in range(g.players)
            ]
        for t in traces[0].d_t:
            rps.append(t, rstrat, rexps)
        traces.append(rps)
    return traces
def test_mccfr_goofspiel4():
    """Exact and sampled exploitability agree for MCCFR on 4-card Goofspiel."""
    game = Goofspiel(4, scoring=Goofspiel.Scoring.ZEROSUM)
    mccfr = OutcomeMCCFR(game, seed=49)
    mccfr.compute(10000, burn=0.5)
    strategies = mccfr.strategies
    for player in (0, 1):
        exact = exploitability(game, player, strategies[player])
        sampled = approx_exploitability(game, player, strategies[player],
                                        10000, seed=31 + player)
        print(player, exact, sampled)
        # Both estimates should land near the known value for this setup.
        assert exact == pytest.approx(0.7, abs=0.2)
        assert sampled == pytest.approx(0.7, abs=0.2)
def main():
    """Train MCCFR on 4-card zero-sum Goofspiel with periodic on-disk
    checkpoints, reporting exploitability at sqrt(2)-spaced iteration counts.

    The value-learning experiment below `assert 0` is currently unreachable.
    """
    N = 4
    g = Goofspiel(N, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=56)
    its = 100.0
    while its < 1000000:
        fname = "goof-{}-{}.strat".format(N, its)
        # Fix: the original read `iterations=iterations=int(its) - ...`,
        # which is a syntax error.
        mc.persist(fname, iterations=int(its) - mc.iterations)
        # Fix: report exploitability for the checkpoint just computed —
        # the original advanced `its` first and printed the *next* count.
        print("Exploitability after {:7d} turns (mc, g): {}, {}".format(
            int(its), exploitability(g, 0, mc), exploitability(g, 1, mc)))
        its *= 2 ** 0.5
    # NOTE(review): deliberate stop? Everything below is dead code —
    # confirm whether it should be removed or re-enabled.
    assert 0
    vs = GoofSpielCardsValueStore(g)
    vl = SparseStochasticValueLearning(g, vs, seed=41)
    # Three stages with decreasing learning rate, concatenated into one curve.
    vals = np.concatenate([
        vl.compute([mc, mc], 1000, alpha=0.01, store_step=1),
        vl.compute([mc, mc], 1000, alpha=0.001, store_step=1),
        vl.compute([mc, mc], 1000, alpha=0.0001, store_step=1),
    ], axis=0)
    plt.plot(vals)
    plt.show()
    print("Values:", vs.values)
    # Re-solve a Goofspiel variant rewarded by the learned card values.
    g2 = Goofspiel(N, scoring=Goofspiel.Scoring.ZEROSUM, rewards=vs.values)
    mc2 = OutcomeMCCFR(g2, seed=57)
    # NOTE(review): ITERS is not defined anywhere in this function — this
    # would raise NameError if the code were ever reached. TODO confirm.
    mc2.compute(iterations=ITERS)
    print("Exp(mc2, g2)", exploitability(g2, 0, mc2), exploitability(g2, 1, mc2))
    print("Exp(mc2, g)", exploitability(g, 0, mc2), exploitability(g, 1, mc2))
def main():
    """Train MCCFR on 4-card zero-sum Goofspiel (checkpointed on disk), then
    fit linear card-value stores with SGD under a grid of sampling settings
    and plot the learning curves for each configuration."""
    N = 4
    ITERS = 2000000
    g = Goofspiel(N, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=56)
    fname = "goof-{}".format(N)
    its = 1024
    # Double the iteration budget each round; `persist` presumably returns
    # truthy when the checkpoint was loaded from cache — TODO confirm.
    while its < ITERS:
        cached = mc.persist(fname, iterations=its)
        if not cached:
            # Only report exploitability for freshly computed checkpoints.
            print("Exploitability after {:7d} turns (mc, g): {}, {}".format(
                its, exploitability(g, 0, mc), exploitability(g, 1, mc)))
        its *= 2
    infosampler = InformationSetSampler(g, mc)
    # Grid over (value-sample count, gradient-sample count) for the learner.
    vsts = (1, 3)
    gsts = (1, 3)
    # First subplot created up front so the others can share its axes.
    ax0 = plt.subplot(len(vsts), len(gsts), 1)
    for i, (vst, gst) in enumerate(itertools.product(vsts, gsts)):
        vs = LinearValueStore(goofspiel_feaures_cards(g.initial_state()),
                              fix_mean=(N + 1) / 2.0)
        vl = SparseSGDLinearValueLearning(g, goofspiel_feaures_cards, vs,
                                          infosampler, seed=44)
        # Anneal the SGD step size over four stages; concatenate the
        # recorded value trajectories into one curve.
        vals = np.concatenate([
            vl.compute([mc, mc], 1000, step=s, record_every=1,
                       val_samples=vst, grad_samples=gst)
            for s in [2**-8, 2**-9, 2**-10, 2**-11]
        ], axis=0)
        #c = ['red', 'green', 'blue', 'black'][i]
        ax = plt.subplot(len(vsts), len(gsts), i + 1, sharex=ax0, sharey=ax0)
        ax.plot(vals)
        ax.legend(list(range(1, N + 1)))
        ax.set_title("valseps={} gradsteps={}".format(vst, gst))
        print("Done sampling valseps={} gradsteps={}".format(vst, gst))
    # NOTE(review): prints only the last configuration's value store —
    # confirm whether this was meant to run inside the loop.
    print("Values:", vs.values)
    plt.show()
    return
    # NOTE(review): unreachable code below the `return`, kept as in the
    # original; it re-solves the game with the learned reward values.
    g2 = Goofspiel(N, scoring=Goofspiel.Scoring.ZEROSUM, rewards=vs.values)
    mc2 = OutcomeMCCFR(g2, seed=57)
    mc2.compute(iterations=ITERS)
    print("Exp(mc2, g2)", exploitability(g2, 0, mc2), exploitability(g2, 1, mc2))
    print("Exp(mc2, g)", exploitability(g, 0, mc2), exploitability(g, 1, mc2))
def test_parse_gambit_strategy_g3():
    """A Gambit-exported Nash equilibrium for 3-card zero-sum Goofspiel
    parses into strategies with (numerically) zero exploitability."""
    g = Goofspiel(3, scoring=Goofspiel.Scoring.ZEROSUM)
    txt = "NE,1,0,0,1,0,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,0,0,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1"
    strats = parse_strategy(g, txt)
    assert exploitability(g, 0, strats[0]) < 1e-6
    # Fix: the original checked player 0's exploitability against strats[1];
    # the second strategy belongs to player 1 (cf. the other tests here).
    assert exploitability(g, 1, strats[1]) < 1e-6