Example #1
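The listing starts at main() and omits the import block. From the names used below, roughly the following imports are assumed; the exact package path of the local modules common and dqn_model is an assumption (from lib import ... is one common layout):

import argparse
import random

import gym
import ptan
import torch
import torch.optim as optim
from ignite.engine import Engine

from lib import common, dqn_model  # assumed location of the helper modules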
def main():
    NAME = "01_baseline"

    random.seed(common.SEED)
    torch.manual_seed(common.SEED)
    params = common.HYPERPARAMS["pong"]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=True,
                        action="store_true",
                        help="Enable cuda")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = common.calc_loss_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine, params, exp_source, NAME)
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
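process_batch delegates the loss computation to common.calc_loss_dqn, which is not part of this listing. Below is a minimal sketch of the standard DQN loss such a helper computes, assuming the usual FirstLast transition fields (state, action, reward, last_state); the actual helper may unpack batches differently:

import numpy as np
import torch
import torch.nn as nn


def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu"):
    # Convert a batch of FirstLast transitions into tensors.
    states = torch.tensor(np.array([t.state for t in batch])).to(device)
    actions = torch.tensor([t.action for t in batch]).to(device)
    rewards = torch.tensor([t.reward for t in batch],
                           dtype=torch.float32).to(device)
    done_mask = torch.tensor([t.last_state is None for t in batch],
                             dtype=torch.bool).to(device)
    last_states = torch.tensor(np.array([
        t.state if t.last_state is None else t.last_state for t in batch
    ])).to(device)

    # Q(s, a) of the online network for the actions actually taken.
    state_action_vals = net(states).gather(
        1, actions.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # Bellman target: r + gamma * max_a' Q_tgt(s', a'), zero for terminals.
        next_state_vals = tgt_net(last_states).max(1)[0]
        next_state_vals[done_mask] = 0.0
        expected_vals = rewards + gamma * next_state_vals
    return nn.MSELoss()(state_action_vals, expected_vals)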
Example #2
        reward = 0.0
        steps = 0

        while True:
            acts, _ = agent([obs])
            obs, r, is_done, _ = test_env.step(acts[0])
            reward += r
            steps += 1
            if is_done:
                break
        test_reward_avg = getattr(engine.state, "test_reward_avg", None)
        if test_reward_avg is None:
            test_reward_avg = reward
        else:
            test_reward_avg = test_reward_avg * 0.95 + 0.05 * reward
        engine.state.test_reward_avg = test_reward_avg
        print("Test done: got %.3f reward after %d steps, avg reward %.3f" %
              (reward, steps, test_reward_avg))
        engine.state.metrics["test_reward"] = reward
        engine.state.metrics["avg_test_reward"] = test_reward_avg
        engine.state.metrics["test_steps"] = steps

        if test_reward_avg > params.stop_test_reward:
            print("Reward boundary has crossed, stopping training. Contgrats!")
            engine.should_terminate = True
        net.train(True)

    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
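Both training loops are driven by common.batch_generator, which is not shown here. A plausible sketch, under the assumption that it simply keeps the ptan replay buffer topped up and yields one training batch per iteration once replay_initial transitions have been collected:

def batch_generator(buffer, initial, batch_size):
    # Fill the replay buffer before the first training batch is produced.
    buffer.populate(initial)
    while True:
        # One environment step per training iteration, then sample a batch.
        buffer.populate(1)
        yield buffer.sample(batch_size)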
Example #3
    def validate(engine: Engine):
        res = validation.validation_run(env_tst, net, device=device)
        print("%d: tst: %s" % (engine.state.iteration, res))
        for key, val in res.items():
            engine.state.metrics[key + "_tst"] = val
        res = validation.validation_run(env_val, net, device=device)
        print("%d: val: %s" % (engine.state.iteration, res))
        for key, val in res.items():
            engine.state.metrics[key + "_val"] = val
        val_reward = res["episode_reward"]
        if getattr(engine.state, "best_val_reward", None) is None:
            engine.state.best_val_reward = val_reward
        if engine.state.best_val_reward < val_reward:
            print("Best validation reward updated: %.3f -> %.3f, model saved" %
                  (engine.state.best_val_reward, val_reward))
            engine.state.best_val_reward = val_reward
            path = saves_path / ("val_reward-%.3f.data" % val_reward)
            torch.save(net.state_dict(), path)

    event = ptan.ignite.PeriodEvents.ITERS_10000_COMPLETED
    tst_metrics = [m + "_tst" for m in validation.METRICS]
    tst_handler = tb_logger.OutputHandler(tag="test", metric_names=tst_metrics)
    tb.attach(engine, log_handler=tst_handler, event_name=event)

    val_metrics = [m + "_val" for m in validation.METRICS]
    val_handler = tb_logger.OutputHandler(tag="validation",
                                          metric_names=val_metrics)
    tb.attach(engine, log_handler=val_handler, event_name=event)

    engine.run(common.batch_generator(buffer, REPLAY_INITIAL, BATCH_SIZE))
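validation.validation_run and validation.METRICS come from the project's own validation module. As an illustration only, a greatly simplified helper with the same interface could play a fixed number of mostly-greedy episodes and return averaged metrics; the real module may track additional metrics:

import numpy as np
import torch

METRICS = ("episode_reward", "episode_steps")


@torch.no_grad()
def validation_run(env, net, episodes=100, device="cpu", epsilon=0.02):
    # Simplified stand-in: play mostly-greedy episodes and average the results.
    stats = {metric: [] for metric in METRICS}
    for _ in range(episodes):
        obs = env.reset()
        total_reward, steps = 0.0, 0
        while True:
            obs_v = torch.tensor(np.array([obs])).to(device)
            action = net(obs_v).max(dim=1)[1].item()
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            steps += 1
            if done:
                break
        stats["episode_reward"].append(total_reward)
        stats["episode_steps"].append(steps)
    return {key: np.mean(vals) for key, vals in stats.items()}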
Example #4
def main():
    NAME = "03_double"
    STATES_TO_EVALUATE = 1000
    EVAL_EVERY_FRAME = 100

    random.seed(common.SEED)
    torch.manual_seed(common.SEED)
    params = common.HYPERPARAMS["pong"]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Enable cuda")
    parser.add_argument("--double",
                        default=False,
                        action="store_true",
                        help="Enable double dqn")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=params.gamma)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

    def process_batch(engine_, batch):
        optimizer.zero_grad()
        loss_v = calc_loss_double_dqn(batch,
                                      net,
                                      tgt_net.target_model,
                                      gamma=params.gamma,
                                      device=device,
                                      double=args.double)
        loss_v.backward()
        optimizer.step()
        epsilon_tracker.frame(engine_.state.iteration)
        if engine_.state.iteration % params.target_net_sync == 0:
            tgt_net.sync()
        if engine_.state.iteration % EVAL_EVERY_FRAME == 0:
            eval_states = getattr(engine_.state, "eval_states", None)
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)
                engine_.state.eval_states = eval_states
            engine_.state.metrics["values"] = common.calc_values_of_states(
                eval_states, net, device)
        return {
            "loss": loss_v.item(),
            "epsilon": selector.epsilon,
        }

    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        params,
                        exp_source,
                        f"{NAME}={args.double}",
                        extra_metrics=("values", ))
    engine.run(
        common.batch_generator(buffer, params.replay_initial,
                               params.batch_size))
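calc_loss_double_dqn is defined elsewhere in the same example. Its distinguishing feature is that with double=True the next-state action is chosen by the online network but its value is read from the target network, which reduces the overestimation bias of plain DQN. A hedged sketch of that core computation follows; unpack_batch is a hypothetical helper mirroring the tensor conversion in the calc_loss_dqn sketch after Example #1:

import torch
import torch.nn as nn


def calc_loss_double_dqn(batch, net, tgt_net, gamma,
                         device="cpu", double=True):
    # unpack_batch is hypothetical; see the calc_loss_dqn sketch above.
    states, actions, rewards, done_mask, last_states = \
        unpack_batch(batch, device)
    state_action_vals = net(states).gather(
        1, actions.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        if double:
            # Double DQN: argmax from the online net, value from the target net.
            next_actions = net(last_states).max(1)[1]
            next_vals = tgt_net(last_states).gather(
                1, next_actions.unsqueeze(-1)).squeeze(-1)
        else:
            # Plain DQN: both argmax and value come from the target net.
            next_vals = tgt_net(last_states).max(1)[0]
        next_vals[done_mask] = 0.0
        expected_vals = rewards + gamma * next_vals
    return nn.MSELoss()(state_action_vals, expected_vals)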
Example #5
    engine = Engine(process_batch)
    common.setup_ignite(engine,
                        PARAMS,
                        exp_source,
                        args.name,
                        extra_metrics=("test_reward", "test_steps"))
    best_test_reward = None

    @engine.on(ptan_ignite.PeriodEvents.ITERS_10000_COMPLETED)
    def test_network(engine):
        net.train(False)
        reward, steps = test_model(net, device, config)
        net.train(True)
        engine.state.metrics["test_reward"] = reward
        engine.state.metrics["test_steps"] = steps
        print("Test done: got %.3f reward after %.2f steps" % (reward, steps))

        global best_test_reward
        if best_test_reward is None:
            best_test_reward = reward
        elif best_test_reward < reward:
            print("Best test reward updated %.3f <- %.3f, save model" %
                  (best_test_reward, reward))
            best_test_reward = reward
            torch.save(net.state_dict(),
                       os.path.join(saves_path, "best_%.3f.dat" % reward))

    engine.run(
        common.batch_generator(buffer, PARAMS.replay_initial,
                               PARAMS.batch_size))
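test_model is another project-specific helper. A hypothetical minimal version, assuming a make_test_env(config) factory (not part of the original code), could average reward and steps over a few greedy episodes, returning float averages consistent with the %.2f format used in the print above:

import numpy as np
import torch


def test_model(net, device, config, episodes=5):
    # Hypothetical helper: average reward/steps over a few greedy episodes.
    env = make_test_env(config)   # assumed project-specific env factory
    total_reward, total_steps = 0.0, 0
    for _ in range(episodes):
        obs = env.reset()
        while True:
            obs_v = torch.tensor(np.array([obs])).to(device)
            action = net(obs_v).max(dim=1)[1].item()
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            total_steps += 1
            if done:
                break
    return total_reward / episodes, total_steps / episodes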
Example #6
            obs, r, is_done, _ = val_env.step(act)
            steps += 1
            reward += r
            if is_done:
                break
        engine.state.metrics["val_reward"] = reward
        engine.state.metrics["val_steps"] = steps
        print("Validation got %.3f reward in %d steps" % (reward, steps))
        best_val_reward = getattr(engine.state, "best_val_reward", None)
        if best_val_reward is None:
            engine.state.best_val_reward = reward
        elif best_val_reward < reward:
            print("Best validation reward updated: %s -> %s" % (best_val_reward, reward))
            save_prep_name = save_path / ("best_val_%.3f_p.dat" % reward)
            save_net_name = save_path / ("best_val_%.3f_n.dat" % reward)
            torch.save(prep.state_dict(), save_prep_name)
            torch.save(net.state_dict(), save_net_name)
            engine.state.best_val_reward = reward

    @engine.on(ptan.ignite.EpisodeEvents.BEST_REWARD_REACHED)
    def best_reward_updated(trainer: Engine):
        reward = trainer.state.metrics["avg_reward"]
        if reward > 0:
            save_prep_name = save_path / ("best_train_%.3f_p.dat" % reward)
            save_net_name = save_path / ("best_train_%.3f_n.dat" % reward)
            torch.save(prep.state_dict(), save_prep_name)
            torch.save(net.state_dict(), save_net_name)
            print("%d: best avg training reward: %.3f, saved" % (trainer.state.iteration, reward))

    engine.run(common.batch_generator(buffer, params.replay_initial, BATCH_SIZE))
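The handlers above save two state dicts per improvement, one for the preprocessor and one for the network. Restoring such a pair for evaluation uses standard PyTorch calls; prep, net, save_path and device are the names from the snippet above, and the reward value baked into the file name is only a placeholder:

# Restore a saved preprocessor/network pair for evaluation.
best_reward = 12.345   # placeholder: value carried by the best checkpoint
prep.load_state_dict(torch.load(
    save_path / ("best_val_%.3f_p.dat" % best_reward), map_location=device))
net.load_state_dict(torch.load(
    save_path / ("best_val_%.3f_n.dat" % best_reward), map_location=device))
prep.eval()
net.eval()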