示例#1
0
def test_learn_play(d = 6, num_layers = 2, num_units = 100,
                    eps = 0.5, iters = 10000, draw=False,
                    tabular = True, batch=False, batch_epochs=10,
                    num_episodes = 10, episode_length = 100):
    """Train a Q function on a ``No_Exit(d)`` game, report scores, then play.

    NOTE(review): a second ``test_learn_play`` is defined later in this file
    and rebinds the name; rename one of them if both are meant to be usable.

    Args:
        d: Passed to ``No_Exit``. Also selects the length of the final
            simulated episodes: ``episode_length`` when ``d > 5``, otherwise
            half of it.
        num_layers: Forwarded to ``NNQ`` (only used when ``tabular`` is false).
        num_units: Forwarded to ``NNQ`` (only used when ``tabular`` is false).
        eps: Forwarded to the learner as ``eps`` (presumably the exploration
            rate -- confirm against ``Q_learn``).
        iters: Forwarded to the learner as ``iters``. A score is recorded
            roughly ten times over the run (every iteration if ``iters <= 10``).
        draw: Forwarded to ``sim_episode``.
        tabular: Use ``TabularQ`` when true, ``NNQ`` otherwise.
        batch: Use ``Q_learn_batch`` when true, ``Q_learn`` otherwise.
        batch_epochs: ``epochs`` given to ``NNQ`` when ``batch`` is true
            (``1`` is used otherwise).
        num_episodes: Number of episodes for each ``evaluate`` call and number
            of episodes simulated after training.
        episode_length: Episode length for ``evaluate`` and (see ``d``) for
            the simulated episodes.

    Returns:
        The third element returned by the last ``sim_episode`` call, or
        ``None`` when ``num_episodes`` is 0.
    """
    iters_per_value = 1 if iters <= 10 else int(iters / 10.0)
    scores = []

    def interact(q, iter=0):
        # Progress callback handed to the learner: every `iters_per_value`
        # iterations, evaluate the greedy policy for `q` and log the score.
        if iter % iters_per_value == 0:
            scores.append((iter, evaluate(game, num_episodes, episode_length,
                                          lambda s: greedy(q, s))[0]))
            print('score', scores[-1], flush=True)

    game = No_Exit(d)
    if tabular:
        q = TabularQ(game.states, game.actions)
    else:
        q = NNQ(game.states, game.actions, game.state2vec, num_layers, num_units,
                epochs=batch_epochs if batch else 1)

    # `eps` is forwarded so the caller's value is honoured instead of being
    # silently replaced by the learner's own default.
    if batch:
        qf = Q_learn_batch(game, q, iters=iters, episode_length = 100, n_episodes=10,
                           eps=eps, interactive_fn=interact)
    else:
        qf = Q_learn(game, q, iters=iters, eps=eps, interactive_fn=interact)

    if scores:
        print('String to upload (incude quotes): "%s"'%toHex(pickle.dumps([tabular, batch, scores], 0).decode()))
        # Plot learning curve
        plot_points(np.array([s[0] for s in scores]),
                    np.array([s[1] for s in scores]))

    # Loop-invariant; integer division keeps the episode length an int
    # (plain `/` would hand a float to sim_episode under Python 3).
    play_length = episode_length if d > 5 else episode_length // 2

    # Pre-bind so that num_episodes == 0 returns None instead of raising
    # UnboundLocalError at the return statement.
    animation = None
    for _ in range(num_episodes):
        reward, _, animation = sim_episode(game, play_length,
                                           lambda s: greedy(qf, s), draw=draw)
        print('Reward', reward)
    return animation
def test_learn_play(game=None, q=None, num_layers=2, num_units=100,
                    eps=0.5, iters=10000, draw=False,
                    tabular=True, batch=False, batch_epochs=10,
                    num_episodes=2, episode_length=500):
    """Train a Q function on a game, then run ``emulate`` with the result.

    Args:
        game: Game to learn on; a fresh ``TempSim()`` is created when ``None``.
        q: Q function to train; when ``None`` a ``TabularQ`` or ``NNQ`` is
            built according to ``tabular``.
        num_layers: Forwarded to ``NNQ`` (only when a new ``NNQ`` is built).
        num_units: Forwarded to ``NNQ`` (only when a new ``NNQ`` is built).
        eps: Forwarded to the learner as ``eps``.
        iters: Forwarded to the learner as ``iters``. A score is recorded
            roughly ten times over the run (every iteration if ``iters <= 10``).
        draw: Currently unused by this function.
        tabular: When ``q`` is ``None``, build ``TabularQ`` if true, else
            ``NNQ``.
        batch: Use ``Q_learn_batch`` when true, ``Q_learn`` otherwise.
        batch_epochs: ``epochs`` given to ``NNQ`` when ``batch`` is true
            (``1`` is used otherwise).
        num_episodes: Number of episodes for each ``evaluate`` call.
        episode_length: Episode length for ``evaluate`` and ``emulate``.

    Returns:
        The ``(game, q)`` pair that was used.

    Side effects:
        When ``q`` is ``None``, rebinds the module-level ``r_stvar`` to
        ``round`` (tabular) or ``float`` (neural network). It is left
        untouched when the caller supplies ``q``.
    """
    global r_stvar

    iters_per_value = 1 if iters <= 10 else int(iters / 10.0)
    scores = []

    def interact(q, iter=0):
        # Progress callback handed to the learner: every `iters_per_value`
        # iterations, evaluate the greedy policy for `q` and log the score.
        if iter % iters_per_value == 0:
            scores.append((iter, evaluate(game, num_episodes, episode_length,
                                          lambda s: greedy(q, s))[0]))
            print('score', scores[-1])

    # Compare against None explicitly: a caller-supplied object that happens
    # to be falsy (e.g. defines __len__/__bool__) must not be replaced.
    if game is None:
        game = TempSim()

    if q is None:
        if tabular:
            r_stvar = round
            q = TabularQ(game.states, game.actions)
        else:
            r_stvar = float
            q = NNQ(game.states, game.actions, game.state2vec, num_layers, num_units,
                    epochs=batch_epochs if batch else 1)

    # The learner's return value is not used below; `q` itself is passed on.
    # NOTE(review): this assumes the learner updates `q` in place -- confirm.
    try:
        if batch:
            Q_learn_batch(game, q, iters=iters, episode_length=100, n_episodes=10,
                          eps=eps, interactive_fn=interact)
        else:
            Q_learn(game, q, iters=iters, eps=eps, interactive_fn=interact)
    except KeyboardInterrupt:
        # Deliberate: Ctrl-C stops training early and execution continues
        # to emulate() with whatever state `q` is in at that point.
        pass

    emulate(game, q, episode_length=episode_length)

    return game, q