Example #1
def test_stack(size=5, bufsize=9, stack_num=4, cached_num=3):
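    # Fill three ReplayBuffer variants (plain stack_num, sample_avail, and
    # save_only_last_obs) from MyTestEnv and check their stacked observations,
    # negative-batch-size sampling, and out-of-range indexing.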
    env = MyTestEnv(size)
    buf = ReplayBuffer(bufsize, stack_num=stack_num)
    buf2 = ReplayBuffer(bufsize, stack_num=stack_num, sample_avail=True)
    buf3 = ReplayBuffer(bufsize, stack_num=stack_num, save_only_last_obs=True)
    obs = env.reset(1)
    for i in range(16):
        obs_next, rew, done, info = env.step(1)
        buf.add(obs, 1, rew, done, None, info)
        buf2.add(obs, 1, rew, done, None, info)
        buf3.add([None, None, obs], 1, rew, done, [None, obs], info)
        obs = obs_next
        if done:
            obs = env.reset(1)
    indice = np.arange(len(buf))
    assert np.allclose(
        buf.get(indice, 'obs')[..., 0],
        [[1, 1, 1, 2], [1, 1, 2, 3], [1, 2, 3, 4], [1, 1, 1, 1], [1, 1, 1, 2],
         [1, 1, 2, 3], [1, 2, 3, 4], [4, 4, 4, 4], [1, 1, 1, 1]])
    assert np.allclose(buf.get(indice, 'obs'), buf3.get(indice, 'obs'))
    assert np.allclose(buf.get(indice, 'obs'), buf3.get(indice, 'obs_next'))
    _, indice = buf2.sample(0)
    assert indice.tolist() == [2, 6]
    _, indice = buf2.sample(1)
    assert indice[0] in [2, 6]
    batch, indice = buf2.sample(-1)  # negative batch size -> no data
    assert indice.tolist() == [] and len(batch) == 0
    with pytest.raises(IndexError):
        buf[bufsize * 2]
Example #2
def test_a2c(args=get_args()):
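    # End-to-end A2C training on a continuous-control gym task with observation
    # normalization, orthogonal initialization, and optional linear LR decay.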
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)],
        norm_obs=True)
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)],
        norm_obs=True,
        obs_rms=train_envs.obs_rms,
        update_obs_rms=False)

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net_a = Net(args.state_shape,
                hidden_sizes=args.hidden_sizes,
                activation=nn.Tanh,
                device=args.device)
    actor = ActorProb(net_a,
                      args.action_shape,
                      max_action=args.max_action,
                      unbounded=True,
                      device=args.device).to(args.device)
    net_c = Net(args.state_shape,
                hidden_sizes=args.hidden_sizes,
                activation=nn.Tanh,
                device=args.device)
    critic = Critic(net_c, device=args.device).to(args.device)
    torch.nn.init.constant_(actor.sigma_param, -0.5)
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            # orthogonal initialization
            torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
            torch.nn.init.zeros_(m.bias)
    # do last policy layer scaling; this will make initial actions have (close to)
    # 0 mean and std, and will help boost performance,
    # see https://arxiv.org/abs/2006.05990, Fig. 24 for details
    for m in actor.mu.modules():
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.zeros_(m.bias)
            m.weight.data.copy_(0.01 * m.weight.data)

    optim = torch.optim.RMSprop(set(actor.parameters()).union(
        critic.parameters()),
                                lr=args.lr,
                                eps=1e-5,
                                alpha=0.99)

    lr_scheduler = None
    if args.lr_decay:
        # decay learning rate to 0 linearly
        max_update_num = np.ceil(
            args.step_per_epoch / args.step_per_collect) * args.epoch

        lr_scheduler = LambdaLR(
            optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

    def dist(*logits):
        return Independent(Normal(*logits), 1)

    policy = A2CPolicy(actor,
                       critic,
                       optim,
                       dist,
                       discount_factor=args.gamma,
                       gae_lambda=args.gae_lambda,
                       max_grad_norm=args.max_grad_norm,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       reward_normalization=args.rew_norm,
                       action_scaling=True,
                       action_bound_method=args.bound_action_method,
                       lr_scheduler=lr_scheduler,
                       action_space=env.action_space)

    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    if args.training_num > 1:
        buffer = VectorReplayBuffer(args.buffer_size, len(train_envs))
    else:
        buffer = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy,
                                train_envs,
                                buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # log
    t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
    log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_a2c'
    log_path = os.path.join(args.logdir, args.task, 'a2c', log_file)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = BasicLogger(writer, update_interval=100, train_interval=100)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    if not args.watch:
        # trainer
        result = onpolicy_trainer(policy,
                                  train_collector,
                                  test_collector,
                                  args.epoch,
                                  args.step_per_epoch,
                                  args.repeat_per_collect,
                                  args.test_num,
                                  args.batch_size,
                                  step_per_collect=args.step_per_collect,
                                  save_fn=save_fn,
                                  logger=logger,
                                  test_in_train=False)
        pprint.pprint(result)

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num,
                                    render=args.render)
    print(
        f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}'
    )
Example #3
def test_episodic_returns(size=2560):
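    # Check BasePolicy.compute_episodic_return (GAE) against hand-computed
    # answers, with and without value estimates and TimeLimit truncation;
    # a timing comparison runs when executed as a script.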
    fn = BasePolicy.compute_episodic_return
    buf = ReplayBuffer(20)
    batch = Batch(
        done=np.array([1, 0, 0, 1, 0, 1, 0, 1.]),
        rew=np.array([0, 1, 2, 3, 4, 5, 6, 7.]),
        info=Batch(
            {
                'TimeLimit.truncated':
                np.array([False, False, False, False, False, True, False, False])
            }
        )
    )
    for b in batch:
        b.obs = b.act = 1
        buf.add(b)
    returns, _ = fn(batch, buf, buf.sample_indices(0), gamma=.1, gae_lambda=1)
    ans = np.array([0, 1.23, 2.3, 3, 4.5, 5, 6.7, 7])
    assert np.allclose(returns, ans)
    buf.reset()
    batch = Batch(
        done=np.array([0, 1, 0, 1, 0, 1, 0.]),
        rew=np.array([7, 6, 1, 2, 3, 4, 5.]),
    )
    for b in batch:
        b.obs = b.act = 1
        buf.add(b)
    returns, _ = fn(batch, buf, buf.sample_indices(0), gamma=.1, gae_lambda=1)
    ans = np.array([7.6, 6, 1.2, 2, 3.4, 4, 5])
    assert np.allclose(returns, ans)
    buf.reset()
    batch = Batch(
        done=np.array([0, 1, 0, 1, 0, 0, 1.]),
        rew=np.array([7, 6, 1, 2, 3, 4, 5.]),
    )
    for b in batch:
        b.obs = b.act = 1
        buf.add(b)
    returns, _ = fn(batch, buf, buf.sample_indices(0), gamma=.1, gae_lambda=1)
    ans = np.array([7.6, 6, 1.2, 2, 3.45, 4.5, 5])
    assert np.allclose(returns, ans)
    buf.reset()
    batch = Batch(
        done=np.array([0, 0, 0, 1., 0, 0, 0, 1, 0, 0, 0, 1]),
        rew=np.array([101, 102, 103., 200, 104, 105, 106, 201, 107, 108, 109, 202]),
    )
    for b in batch:
        b.obs = b.act = 1
        buf.add(b)
    v = np.array([2., 3., 4, -1, 5., 6., 7, -2, 8., 9., 10, -3])
    returns, _ = fn(batch, buf, buf.sample_indices(0), v, gamma=0.99, gae_lambda=0.95)
    ground_truth = np.array(
        [
            454.8344, 376.1143, 291.298, 200., 464.5610, 383.1085, 295.387, 201.,
            474.2876, 390.1027, 299.476, 202.
        ]
    )
    assert np.allclose(returns, ground_truth)
    buf.reset()
    batch = Batch(
        done=np.array([0, 0, 0, 1., 0, 0, 0, 1, 0, 0, 0, 1]),
        rew=np.array([101, 102, 103., 200, 104, 105, 106, 201, 107, 108, 109, 202]),
        info=Batch(
            {
                'TimeLimit.truncated':
                np.array(
                    [
                        False, False, False, True, False, False, False, True, False,
                        False, False, False
                    ]
                )
            }
        )
    )
    for b in batch:
        b.obs = b.act = 1
        buf.add(b)
    v = np.array([2., 3., 4, -1, 5., 6., 7, -2, 8., 9., 10, -3])
    returns, _ = fn(batch, buf, buf.sample_indices(0), v, gamma=0.99, gae_lambda=0.95)
    ground_truth = np.array(
        [
            454.0109, 375.2386, 290.3669, 199.01, 462.9138, 381.3571, 293.5248, 199.02,
            474.2876, 390.1027, 299.476, 202.
        ]
    )
    assert np.allclose(returns, ground_truth)

    if __name__ == '__main__':
        buf = ReplayBuffer(size)
        batch = Batch(
            done=np.random.randint(100, size=size) == 0,
            rew=np.random.random(size),
        )
        for b in batch:
            b.obs = b.act = 1
            buf.add(b)
        indices = buf.sample_indices(0)

        def vanilla():
            return compute_episodic_return_base(batch, gamma=.1)

        def optimized():
            return fn(batch, buf, indices, gamma=.1, gae_lambda=1.0)

        cnt = 3000
        print('GAE vanilla', timeit(vanilla, setup=vanilla, number=cnt))
        print('GAE optim  ', timeit(optimized, setup=optimized, number=cnt))
Example #4
def test_sac_with_il(args=get_args()):
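    # Train SAC on the task, then train an imitation policy from data collected
    # by the SAC policy; both runs must reach the environment reward threshold.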
    torch.set_num_threads(1)  # we need only one thread for the NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = ActorProb(net, args.action_shape, args.max_action,
                      args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.layer_num,
              args.state_shape,
              args.action_shape,
              concat=True,
              device=args.device)
    critic1 = Critic(net, args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net, args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        tau=args.tau,
        gamma=args.gamma,
        alpha=args.alpha,
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')

    # here we define an imitation collector with a trivial policy
    policy.eval()
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -300  # lower the goal
    net = Actor(Net(1, args.state_shape,
                    device=args.device), args.action_shape, args.max_action,
                args.device).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
    il_policy = ImitationPolicy(net, optim, mode='continuous')
    il_test_collector = Collector(
        il_policy,
        DummyVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.test_num)]))
    train_collector.reset()
    result = offpolicy_trainer(il_policy,
                               train_collector,
                               il_test_collector,
                               args.epoch,
                               args.step_per_epoch // 5,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        il_policy.eval()
        collector = Collector(il_policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
Example #5
def test_ppo(args=get_args()):
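    # On-policy PPO training on an Atari environment with a shared CNN feature
    # extractor for the actor and critic.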
    env = create_atari_environment(args.task,
                                   max_episode_steps=args.max_episode_steps)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space().shape or env.action_space().n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv([
        lambda: create_atari_environment(
            args.task, max_episode_steps=args.max_episode_steps)
        for _ in range(args.test_num)
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = CNN(args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(actor,
                       critic,
                       optim,
                       dist,
                       args.gamma,
                       max_grad_norm=args.max_grad_norm,
                       eps_clip=args.eps_clip,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       action_range=None)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo', args.note)
    writer = SummaryWriter(log_path)

    def stop_fn(x):
        if env.env.spec.reward_threshold:
            return x >= env.env.spec.reward_threshold
        else:
            return False

    def save_fn(pol):
        torch.save(pol.state_dict(), os.path.join(log_path, 'policy.pth'))

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              save_fn=save_fn,
                              writer=writer,
                              task=args.task)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_step=2000, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #6
def test_ppo(args=get_args()):
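    # Discrete-action PPO on single (non-vectorized) gym environments, using
    # helper wrappers for the networks and state/action sizes.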
    env = gym.make(args.task)

    train_envs = gym.make(args.task)
    args.state_shape = get_observation_size(train_envs)
    args.action_shape = get_action_size(train_envs)
    test_envs = gym.make(args.task)
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net_Dis(args.layer_num, args.state_shape, device=args.device)
    actor = Actor_Dis(net, args.action_shape).to(args.device)
    critic = Critic_Dis(net).to(args.device)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(actor,
                       critic,
                       optim,
                       dist,
                       args.gamma,
                       max_grad_norm=args.max_grad_norm,
                       eps_clip=args.eps_clip,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       action_range=None)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        if env.env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        else:
            return False

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              writer=writer,
                              task=args.task)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_step=2000, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #7
def test_td3(args=get_args()):
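    # TD3 example: either train from scratch and save the policy, or load a
    # saved policy and watch it run, depending on args.train.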
    torch.set_num_threads(1)  # we need only one thread for the NN
    env = gym_make()

    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.action_range = [env.action_space.low[0], env.action_space.high[0]]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym_make()
    train_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.training_num)])
    # test_envs = gym_make()
    test_envs = VectorEnv([lambda: gym_make() for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    policy = init_policy(args, env)

    if args.train == 1:
        # collector
        train_collector = Collector(policy, train_envs,
                                    ReplayBuffer(args.buffer_size))
        test_collector = Collector(policy, test_envs)
        # train_collector.collect(n_step=args.buffer_size)
        # log
        log_path = os.path.join(args.logdir, args.task, 'td3')
        writer = SummaryWriter(log_path)

        def save_fn(policy):
            torch.save(policy.state_dict(),
                       os.path.join(log_path, 'policy.pth'))

        # trainer
        result = offpolicy_trainer(policy,
                                   train_collector,
                                   test_collector,
                                   args.epoch,
                                   args.step_per_epoch,
                                   args.collect_per_step,
                                   args.test_num,
                                   args.batch_size,
                                   stop_fn=None,
                                   save_fn=save_fn,
                                   writer=writer)

        train_collector.close()
        test_collector.close()

    else:
        log_path = os.path.join(args.logdir, args.task, 'td3')
        policy.load_state_dict(
            torch.load(os.path.join(log_path, 'policy.pth'),
                       map_location=args.device))
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #8
def test_multibuf_hdf5():
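    # Save ReplayBufferManager and CachedReplayBuffer instances to HDF5, reload
    # them, and check that contents, indices, and subsequent adds still match.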
    size = 100
    buffers = {
        "vector": ReplayBufferManager([ReplayBuffer(size) for i in range(4)]),
        "cached": CachedReplayBuffer(ReplayBuffer(size), 4, size)
    }
    buffer_types = {k: b.__class__ for k, b in buffers.items()}
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    info_t = torch.tensor([1.]).to(device)
    for i in range(4):
        kwargs = {
            'obs': Batch(index=np.array([i])),
            'act': i,
            'rew': np.array([1, 2]),
            'done': i % 3 == 2,
            'info': {
                "number": {
                    "n": i,
                    "t": info_t
                },
                'extra': None
            },
        }
        buffers["vector"].add(**Batch.stack([kwargs, kwargs, kwargs]),
                              buffer_ids=[0, 1, 2])
        buffers["cached"].add(**Batch.stack([kwargs, kwargs, kwargs]),
                              cached_buffer_ids=[0, 1, 2])

    # save
    paths = {}
    for k, buf in buffers.items():
        f, path = tempfile.mkstemp(suffix='.hdf5')
        os.close(f)
        buf.save_hdf5(path)
        paths[k] = path

    # load replay buffer
    _buffers = {k: buffer_types[k].load_hdf5(paths[k]) for k in paths.keys()}

    # compare
    for k in buffers.keys():
        assert len(_buffers[k]) == len(buffers[k])
        assert np.allclose(_buffers[k].act, buffers[k].act)
        assert _buffers[k].stack_num == buffers[k].stack_num
        assert _buffers[k].maxsize == buffers[k].maxsize
        assert np.all(_buffers[k]._indices == buffers[k]._indices)
    # check shallow copy in ReplayBufferManager
    for k in ["vector", "cached"]:
        buffers[k].info.number.n[0] = -100
        assert buffers[k].buffers[0].info.number.n[0] == -100
    # check if still behave normally
    for k in ["vector", "cached"]:
        kwargs = {
            'obs': Batch(index=np.array([5])),
            'act': 5,
            'rew': np.array([2, 1]),
            'done': False,
            'info': {
                "number": {
                    "n": i
                },
                'Timelimit.truncate': True
            },
        }
        buffers[k].add(**Batch.stack([kwargs, kwargs, kwargs, kwargs]))
        act = np.zeros(buffers[k].maxsize)
        if k == "vector":
            act[np.arange(5)] = np.array([0, 1, 2, 3, 5])
            act[np.arange(5) + size] = np.array([0, 1, 2, 3, 5])
            act[np.arange(5) + size * 2] = np.array([0, 1, 2, 3, 5])
            act[size * 3] = 5
        elif k == "cached":
            act[np.arange(9)] = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2])
            act[np.arange(3) + size] = np.array([3, 5, 2])
            act[np.arange(3) + size * 2] = np.array([3, 5, 2])
            act[np.arange(3) + size * 3] = np.array([3, 5, 2])
            act[size * 4] = 5
        assert np.allclose(buffers[k].act, act)

    for path in paths.values():
        os.remove(path)
Example #9
def test_a2c_with_il(args=get_args()):
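    # Custom multi-agent A2C loop: one server policy and three device policies
    # (ND/RD/FD) interact with the environment, store transitions in separate
    # ReplayBuffers, and are updated once per epoch without a Collector.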
    torch.set_num_threads(1)  # for poor CPU
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    # train_envs = DummyVectorEnv(
    #     [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # # test_envs = gym.make(args.task)
    # test_envs = DummyVectorEnv(
    #     [lambda: gym.make(args.task) for _ in range(args.test_num)])
    if args.data_quantity != 0:
        env.set_data_quantity(args.data_quantity)
    if args.data_quality != 0:
        env.set_data_quality(args.data_quality)
    if args.psi != 0:
        env.set_psi(args.psi)
    if args.nu != 0:
        env.set_nu(args.nu)
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # train_envs.seed(args.seed)
    # test_envs.seed(args.seed)
    # model

    # server policy
    server_policy = build_policy(0, args)

    # client policy
    ND_policy = build_policy(1, args)
    RD_policy = build_policy(2, args)
    FD_policy = build_policy(3, args)
    # use ReplayBuffer directly instead of a Collector
    server_buffer = ReplayBuffer(args.buffer_size)
    ND_buffer = ReplayBuffer(args.buffer_size)
    RD_buffer = ReplayBuffer(args.buffer_size)
    FD_buffer = ReplayBuffer(args.buffer_size)

    # log
    log_path = os.path.join(args.logdir, args.task, 'a2c')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    start_time = time.time()
    _server_obs, _ND_obs, _RD_obs, _FD_obs = env.reset()
    _server_act = _server_rew = _done = _info = None
    server_buffer.reset()
    _ND_act = _ND_rew = _RD_act = _RD_rew = _FD_act = _FD_rew = [None]
    ND_buffer.reset()
    RD_buffer.reset()
    FD_buffer.reset()
    all_server_costs = []
    all_ND_utility = []
    all_RD_utility = []
    all_FD_utility = []
    all_leak_probability = []
    for epoch in range(1, 1 + args.epoch):
        # each epoch: collect N*T transitions, then train M times with batch size B
        server_costs = []
        ND_utility = []
        FD_utility = []
        RD_utility = []
        leak_probability = []
        payment = []
        expected_time = []
        training_time = []
        for e in range(5):
            with tqdm.tqdm(total=args.step_per_epoch,
                           desc=f'Epoch #{epoch}',
                           **tqdm_config) as t:
                while t.n < t.total:
                    # collect data without computing gradients
                    # server
                    _server_obs, _ND_obs, _RD_obs, _FD_obs = env.reset()
                    server_batch = Batch(obs=_server_obs,
                                         act=_server_act,
                                         rew=_server_rew,
                                         done=_done,
                                         obs_next=None,
                                         info=_info,
                                         policy=None)
                    with torch.no_grad():
                        server_result = server_policy(server_batch, None)
                    _server_policy = [{}]
                    _server_act = to_numpy(server_result.act)
                    # ND
                    ND_batch = Batch(obs=_ND_obs,
                                     act=_ND_act,
                                     rew=_ND_rew,
                                     done=_done,
                                     obs_next=None,
                                     info=_info,
                                     policy=None)
                    with torch.no_grad():
                        ND_result = ND_policy(ND_batch, None)
                    _ND_policy = [{}]
                    _ND_act = to_numpy(ND_result.act)
                    # RD
                    RD_batch = Batch(obs=_RD_obs,
                                     act=_RD_act,
                                     rew=_RD_rew,
                                     done=_done,
                                     obs_next=None,
                                     info=_info,
                                     policy=None)
                    with torch.no_grad():
                        RD_result = RD_policy(RD_batch, None)
                    _RD_policy = [{}]
                    _RD_act = to_numpy(RD_result.act)
                    # FD
                    FD_batch = Batch(obs=_FD_obs,
                                     act=_FD_act,
                                     rew=_FD_rew,
                                     done=_done,
                                     obs_next=None,
                                     info=_info,
                                     policy=None)
                    with torch.no_grad():
                        FD_result = FD_policy(FD_batch, None)
                    _FD_policy = [{}]
                    _FD_act = to_numpy(FD_result.act)
                    # print(_ND_act.shape)
                    server_obs_next, ND_obs_next, RD_obs_next, FD_obs_next, _server_rew, _client_rew, _done, _info = env.step(
                        _server_act[0], _ND_act[0], _RD_act[0], _FD_act[0])

                    server_costs.append(_server_rew)
                    ND_utility.append(_client_rew[0])
                    RD_utility.append(_client_rew[1])
                    FD_utility.append(_client_rew[2])
                    leak_probability.append(_info[0]["leak"])
                    payment.append(env.payment)
                    expected_time.append(env.expected_time)
                    training_time.append(env.global_time * env.time_lambda)
                    # add transitions to the replay buffers
                    server_buffer.add(
                        Batch(obs=_server_obs[0],
                              act=_server_act[0],
                              rew=_server_rew[0],
                              done=_done[0],
                              obs_next=server_obs_next[0],
                              info=_info[0],
                              policy=_server_policy[0]))
                    ND_buffer.add(
                        Batch(obs=_ND_obs[0],
                              act=_ND_act[0],
                              rew=_client_rew[0],
                              done=_done[0],
                              obs_next=ND_obs_next[0],
                              info=_info[0],
                              policy=_ND_policy[0]))
                    RD_buffer.add(
                        Batch(obs=_RD_obs[0],
                              act=_RD_act[0],
                              rew=_client_rew[1],
                              done=_done[0],
                              obs_next=RD_obs_next[0],
                              info=_info[0],
                              policy=_RD_policy[0]))
                    FD_buffer.add(
                        Batch(obs=_FD_obs[0],
                              act=_FD_act[0],
                              rew=_client_rew[2],
                              done=_done[0],
                              obs_next=FD_obs_next[0],
                              info=_info[0],
                              policy=_FD_policy[0]))
                    t.update(1)
                    _server_obs = server_obs_next
                    _ND_obs = ND_obs_next
                    _RD_obs = RD_obs_next
                    _FD_obs = FD_obs_next
        all_server_costs.append(np.array(server_costs).mean())
        all_ND_utility.append(np.array(ND_utility).mean())
        all_RD_utility.append(np.array(RD_utility).mean())
        all_FD_utility.append(np.array(FD_utility).mean())
        all_leak_probability.append(np.array(leak_probability).mean())
        print("current bandwidth:", env.bandwidth)
        print("leak signal:", env.leak_NU, env.leak_FU)
        print("current server cost:", np.array(server_costs).mean())
        print("current device utility:", all_ND_utility[-1],
              all_RD_utility[-1], all_FD_utility[-1])
        print("leak probability:", all_leak_probability[-1])
        print("server_act:", _server_act[0])
        print("device_acts:", _ND_act[0], _RD_act[0], _FD_act[0])
        print("payment cost:", np.array(payment).mean())
        print("Expected time cost:", np.array(expected_time).mean())
        print("Training time cost:", np.array(training_time).mean())
        # print("server_act:",_server_act)
        # print("client_act:",_client_act)
        print("info:", env.communication_time, env.computation_time,
              env.K_theta)
        server_batch_data, server_indice = server_buffer.sample(0)
        server_batch_data = server_policy.process_fn(server_batch_data,
                                                     server_buffer,
                                                     server_indice)
        server_policy.learn(server_batch_data, args.batch_size,
                            args.repeat_per_collect)
        # server_buffer.reset()

        ND_batch_data, ND_indice = ND_buffer.sample(0)
        ND_batch_data = ND_policy.process_fn(ND_batch_data, ND_buffer,
                                             ND_indice)
        ND_policy.learn(ND_batch_data, args.batch_size,
                        args.repeat_per_collect)
        # ND_buffer.reset()

        RD_batch_data, RD_indice = RD_buffer.sample(0)
        RD_batch_data = RD_policy.process_fn(RD_batch_data, RD_buffer,
                                             RD_indice)
        RD_policy.learn(RD_batch_data, args.batch_size,
                        args.repeat_per_collect)
        # RD_buffer.reset()

        FD_batch_data, FD_indice = FD_buffer.sample(0)
        FD_batch_data = FD_policy.process_fn(FD_batch_data, FD_buffer,
                                             FD_indice)
        FD_policy.learn(FD_batch_data, args.batch_size,
                        args.repeat_per_collect)
        # FD_buffer.reset()
    print("all_server_cost:", all_server_costs)
    print("all_ND_utility:", all_ND_utility)
    print("all_RD_utility:", all_RD_utility)
    print("all_FD_utility:", all_FD_utility)
    print("all_leak_probability:", all_leak_probability)
    plt.plot(all_server_costs)
    plt.show()
Example #10
def test_cachedbuffer():
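    # Exercise CachedReplayBuffer: adds routed by cached_buffer_ids, episode
    # bookkeeping, prev/next indexing, and a zero-size main buffer corner case.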
    buf = CachedReplayBuffer(ReplayBuffer(10), 4, 5)
    assert buf.sample_index(0).tolist() == []
    # check the normal function/usage/storage in CachedReplayBuffer
    ep_len, ep_rew = buf.add(obs=[1],
                             act=[1],
                             rew=[1],
                             done=[0],
                             cached_buffer_ids=[1])
    obs = np.zeros(buf.maxsize)
    obs[15] = 1
    indice = buf.sample_index(0)
    assert np.allclose(indice, [15])
    assert np.allclose(buf.prev(indice), [15])
    assert np.allclose(buf.next(indice), [15])
    assert np.allclose(buf.obs, obs)
    assert np.allclose(ep_len, [0]) and np.allclose(ep_rew, [0.0])
    ep_len, ep_rew = buf.add(obs=[2],
                             act=[2],
                             rew=[2],
                             done=[1],
                             cached_buffer_ids=[3])
    obs[[0, 25]] = 2
    indice = buf.sample_index(0)
    assert np.allclose(indice, [0, 15])
    assert np.allclose(buf.prev(indice), [0, 15])
    assert np.allclose(buf.next(indice), [0, 15])
    assert np.allclose(buf.obs, obs)
    assert np.allclose(ep_len, [1]) and np.allclose(ep_rew, [2.0])
    assert np.allclose(buf.unfinished_index(), [15])
    assert np.allclose(buf.sample_index(0), [0, 15])
    ep_len, ep_rew = buf.add(obs=[3, 4],
                             act=[3, 4],
                             rew=[3, 4],
                             done=[0, 1],
                             cached_buffer_ids=[3, 1])
    assert np.allclose(ep_len, [0, 2]) and np.allclose(ep_rew, [0, 5.0])
    obs[[0, 1, 2, 15, 16, 25]] = [2, 1, 4, 1, 4, 3]
    assert np.allclose(buf.obs, obs)
    assert np.allclose(buf.unfinished_index(), [25])
    indice = buf.sample_index(0)
    assert np.allclose(indice, [0, 1, 2, 25])
    assert np.allclose(buf.done[indice], [1, 0, 1, 0])
    assert np.allclose(buf.prev(indice), [0, 1, 1, 25])
    assert np.allclose(buf.next(indice), [0, 2, 2, 25])
    indice = buf.sample_index(10000)
    assert np.bincount(indice)[[0, 1, 2, 25]].min() > 2000  # uniform sample
    # cached buffer with main_buffer size == 0 (no update)
    # used in test_collector
    buf = CachedReplayBuffer(ReplayBuffer(0, sample_avail=True), 4, 5)
    data = np.zeros(4)
    rew = np.ones([4, 4])
    buf.add(obs=data, act=data, rew=rew, done=[0, 0, 1, 1], obs_next=data)
    buf.add(obs=data, act=data, rew=rew, done=[0, 0, 0, 0], obs_next=data)
    buf.add(obs=data, act=data, rew=rew, done=[1, 1, 1, 1], obs_next=data)
    buf.add(obs=data, act=data, rew=rew, done=[0, 0, 0, 0], obs_next=data)
    buf.add(obs=data, act=data, rew=rew, done=[0, 1, 0, 1], obs_next=data)
    assert np.allclose(buf.done, [
        0,
        0,
        1,
        0,
        0,
        0,
        1,
        1,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        1,
        0,
        0,
        0,
    ])
    indice = buf.sample_index(0)
    assert np.allclose(indice, [0, 1, 10, 11])
    assert np.allclose(buf.prev(indice), [0, 0, 10, 10])
    assert np.allclose(buf.next(indice), [1, 1, 11, 11])
Example #11
def test_multibuf_stack():
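    # CachedReplayBuffer combined with frame stacking, ignore_obs_next, and a
    # prioritized main buffer; checks stored layout, sampling, and weights.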
    size = 5
    bufsize = 9
    stack_num = 4
    cached_num = 3
    env = MyTestEnv(size)
    # test if CachedReplayBuffer can handle stack_num + ignore_obs_next
    buf4 = CachedReplayBuffer(
        ReplayBuffer(bufsize, stack_num=stack_num, ignore_obs_next=True),
        cached_num, size)
    # test if CachedReplayBuffer can handle super corner case:
    # prio-buffer + stack_num + ignore_obs_next + sample_avail
    buf5 = CachedReplayBuffer(
        PrioritizedReplayBuffer(bufsize,
                                0.6,
                                0.4,
                                stack_num=stack_num,
                                ignore_obs_next=True,
                                sample_avail=True), cached_num, size)
    obs = env.reset(1)
    for i in range(18):
        obs_next, rew, done, info = env.step(1)
        obs_list = np.array([obs + size * i for i in range(cached_num)])
        act_list = [1] * cached_num
        rew_list = [rew] * cached_num
        done_list = [done] * cached_num
        obs_next_list = -obs_list
        info_list = [info] * cached_num
        buf4.add(obs_list, act_list, rew_list, done_list, obs_next_list,
                 info_list)
        buf5.add(obs_list, act_list, rew_list, done_list, obs_next_list,
                 info_list)
        obs = obs_next
        if done:
            obs = env.reset(1)
    # check the `add` order is correct
    assert np.allclose(
        buf4.obs.reshape(-1),
        [
            12,
            13,
            14,
            4,
            6,
            7,
            8,
            9,
            11,  # main_buffer
            1,
            2,
            3,
            4,
            0,  # cached_buffer[0]
            6,
            7,
            8,
            9,
            0,  # cached_buffer[1]
            11,
            12,
            13,
            14,
            0,  # cached_buffer[2]
        ]), buf4.obs
    assert np.allclose(
        buf4.done,
        [
            0,
            0,
            1,
            1,
            0,
            0,
            0,
            1,
            0,  # main_buffer
            0,
            0,
            0,
            1,
            0,  # cached_buffer[0]
            0,
            0,
            0,
            1,
            0,  # cached_buffer[1]
            0,
            0,
            0,
            1,
            0,  # cached_buffer[2]
        ]), buf4.done
    assert np.allclose(buf4.unfinished_index(), [10, 15, 20])
    indice = sorted(buf4.sample_index(0))
    assert np.allclose(indice, list(range(bufsize)) + [9, 10, 14, 15, 19, 20])
    assert np.allclose(buf4[indice].obs[..., 0], [
        [11, 11, 11, 12],
        [11, 11, 12, 13],
        [11, 12, 13, 14],
        [4, 4, 4, 4],
        [6, 6, 6, 6],
        [6, 6, 6, 7],
        [6, 6, 7, 8],
        [6, 7, 8, 9],
        [11, 11, 11, 11],
        [1, 1, 1, 1],
        [1, 1, 1, 2],
        [6, 6, 6, 6],
        [6, 6, 6, 7],
        [11, 11, 11, 11],
        [11, 11, 11, 12],
    ])
    assert np.allclose(buf4[indice].obs_next[..., 0], [
        [11, 11, 12, 13],
        [11, 12, 13, 14],
        [11, 12, 13, 14],
        [4, 4, 4, 4],
        [6, 6, 6, 7],
        [6, 6, 7, 8],
        [6, 7, 8, 9],
        [6, 7, 8, 9],
        [11, 11, 11, 12],
        [1, 1, 1, 2],
        [1, 1, 1, 2],
        [6, 6, 6, 7],
        [6, 6, 6, 7],
        [11, 11, 11, 12],
        [11, 11, 11, 12],
    ])
    assert np.all(buf4.done == buf5.done)
    indice = buf5.sample_index(0)
    assert np.allclose(sorted(indice), [2, 7])
    assert np.all(np.isin(buf5.sample_index(100), indice))
    # manually change the stack num
    buf5.stack_num = 2
    for buf in buf5.buffers:
        buf.stack_num = 2
    indice = buf5.sample_index(0)
    assert np.allclose(sorted(indice), [0, 1, 2, 5, 6, 7, 10, 15, 20])
    batch, _ = buf5.sample(0)
    assert np.allclose(buf5[np.arange(buf5.maxsize)].weight, 1)
    buf5.update_weight(indice, batch.weight * 0)
    weight = buf5[np.arange(buf5.maxsize)].weight
    modified_weight = weight[[0, 1, 2, 5, 6, 7]]
    assert modified_weight.min() == modified_weight.max()
    assert modified_weight.max() < 1
    unmodified_weight = weight[[3, 4, 8]]
    assert unmodified_weight.min() == unmodified_weight.max()
    assert unmodified_weight.max() < 1
    cached_weight = weight[9:]
    assert cached_weight.min() == cached_weight.max() == 1
    # test Atari with CachedReplayBuffer, save_only_last_obs + ignore_obs_next
    buf6 = CachedReplayBuffer(
        ReplayBuffer(bufsize,
                     stack_num=stack_num,
                     save_only_last_obs=True,
                     ignore_obs_next=True), cached_num, size)
    obs = np.random.rand(size, 4, 84, 84)
    buf6.add(obs=[obs[2], obs[0]],
             act=[1, 1],
             rew=[0, 0],
             done=[0, 1],
             obs_next=[obs[3], obs[1]],
             cached_buffer_ids=[1, 2])
    assert buf6.obs.shape == (buf6.maxsize, 84, 84)
    assert np.allclose(buf6.obs[0], obs[0, -1])
    assert np.allclose(buf6.obs[14], obs[2, -1])
    assert np.allclose(buf6.obs[19], obs[0, -1])
    assert buf6[0].obs.shape == (4, 84, 84)
Example #12
def test_replaybuffermanager():
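    # ReplayBufferManager over four small buffers: per-buffer adds, uniform
    # sampling, prev/next traversal, unfinished_index, and set_batch.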
    buf = ReplayBufferManager([ReplayBuffer(size=5) for i in range(4)])
    ep_len, ep_rew = buf.add(obs=[1, 2, 3],
                             act=[1, 2, 3],
                             rew=[1, 2, 3],
                             done=[0, 0, 1],
                             buffer_ids=[0, 1, 2])
    assert np.allclose(ep_len, [0, 0, 1]) and np.allclose(ep_rew, [0, 0, 3])
    with pytest.raises(NotImplementedError):
        # ReplayBufferManager cannot be updated
        buf.update(buf)
    # sample index / prev / next / unfinished_index
    indice = buf.sample_index(11000)
    assert np.bincount(indice)[[0, 5, 10]].min() >= 3000  # uniform sample
    batch, indice = buf.sample(0)
    assert np.allclose(indice, [0, 5, 10])
    indice_prev = buf.prev(indice)
    assert np.allclose(indice_prev, indice), indice_prev
    indice_next = buf.next(indice)
    assert np.allclose(indice_next, indice), indice_next
    assert np.allclose(buf.unfinished_index(), [0, 5])
    buf.add(obs=[4], act=[4], rew=[4], done=[1], buffer_ids=[3])
    assert np.allclose(buf.unfinished_index(), [0, 5])
    batch, indice = buf.sample(10)
    batch, indice = buf.sample(0)
    assert np.allclose(indice, [0, 5, 10, 15])
    indice_prev = buf.prev(indice)
    assert np.allclose(indice_prev, indice), indice_prev
    indice_next = buf.next(indice)
    assert np.allclose(indice_next, indice), indice_next
    data = np.array([0, 0, 0, 0])
    buf.add(obs=data, act=data, rew=data, done=data, buffer_ids=[0, 1, 2, 3])
    buf.add(obs=data,
            act=data,
            rew=data,
            done=1 - data,
            buffer_ids=[0, 1, 2, 3])
    assert len(buf) == 12
    buf.add(obs=data, act=data, rew=data, done=data, buffer_ids=[0, 1, 2, 3])
    buf.add(obs=data,
            act=data,
            rew=data,
            done=[0, 1, 0, 1],
            buffer_ids=[0, 1, 2, 3])
    assert len(buf) == 20
    indice = buf.sample_index(120000)
    assert np.bincount(indice).min() >= 5000
    batch, indice = buf.sample(10)
    indice = buf.sample_index(0)
    assert np.allclose(indice, np.arange(len(buf)))
    # check the actual data stored in buf._meta
    assert np.allclose(buf.done, [
        0,
        0,
        1,
        0,
        0,
        0,
        0,
        1,
        0,
        1,
        1,
        0,
        1,
        0,
        0,
        1,
        0,
        1,
        0,
        1,
    ])
    assert np.allclose(buf.prev(indice), [
        0,
        0,
        1,
        3,
        3,
        5,
        5,
        6,
        8,
        8,
        10,
        11,
        11,
        13,
        13,
        15,
        16,
        16,
        18,
        18,
    ])
    assert np.allclose(buf.next(indice), [
        1,
        2,
        2,
        4,
        4,
        6,
        7,
        7,
        9,
        9,
        10,
        12,
        12,
        14,
        14,
        15,
        17,
        17,
        19,
        19,
    ])
    assert np.allclose(buf.unfinished_index(), [4, 14])
    ep_len, ep_rew = buf.add(obs=[1],
                             act=[1],
                             rew=[1],
                             done=[1],
                             buffer_ids=[2])
    assert np.allclose(ep_len, [3]) and np.allclose(ep_rew, [1])
    assert np.allclose(buf.unfinished_index(), [4])
    indice = list(sorted(buf.sample_index(0)))
    assert np.allclose(indice, np.arange(len(buf)))
    assert np.allclose(buf.prev(indice), [
        0,
        0,
        1,
        3,
        3,
        5,
        5,
        6,
        8,
        8,
        14,
        11,
        11,
        13,
        13,
        15,
        16,
        16,
        18,
        18,
    ])
    assert np.allclose(buf.next(indice), [
        1,
        2,
        2,
        4,
        4,
        6,
        7,
        7,
        9,
        9,
        10,
        12,
        12,
        14,
        10,
        15,
        17,
        17,
        19,
        19,
    ])
    # corner case: list, int and -1
    assert buf.prev(-1) == buf.prev([buf.maxsize - 1])[0]
    assert buf.next(-1) == buf.next([buf.maxsize - 1])[0]
    batch = buf._meta
    batch.info.n = np.ones(buf.maxsize)
    buf.set_batch(batch)
    assert np.allclose(buf.buffers[-1].info.n, [1] * 5)
    assert buf.sample_index(-1).tolist() == []
    assert np.array([ReplayBuffer(0, ignore_obs_next=True)]).dtype == np.object
Example #13
def test_hdf5():
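    # HDF5 round-trip for ReplayBuffer, ListReplayBuffer, and
    # PrioritizedReplayBuffer, plus failure cases for unpicklable values.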
    size = 100
    buffers = {
        "array": ReplayBuffer(size, stack_num=2),
        "list": ListReplayBuffer(),
        "prioritized": PrioritizedReplayBuffer(size, 0.6, 0.4),
    }
    buffer_types = {k: b.__class__ for k, b in buffers.items()}
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    info_t = torch.tensor([1.]).to(device)
    for i in range(4):
        kwargs = {
            'obs': Batch(index=np.array([i])),
            'act': i,
            'rew': np.array([1, 2]),
            'done': i % 3 == 2,
            'info': {
                "number": {
                    "n": i,
                    "t": info_t
                },
                'extra': None
            },
        }
        buffers["array"].add(**kwargs)
        buffers["list"].add(**kwargs)
        buffers["prioritized"].add(weight=np.random.rand(), **kwargs)

    # save
    paths = {}
    for k, buf in buffers.items():
        f, path = tempfile.mkstemp(suffix='.hdf5')
        os.close(f)
        buf.save_hdf5(path)
        paths[k] = path

    # load replay buffer
    _buffers = {k: buffer_types[k].load_hdf5(paths[k]) for k in paths.keys()}

    # compare
    for k in buffers.keys():
        assert len(_buffers[k]) == len(buffers[k])
        assert np.allclose(_buffers[k].act, buffers[k].act)
        assert _buffers[k].stack_num == buffers[k].stack_num
        assert _buffers[k].maxsize == buffers[k].maxsize
        assert np.all(_buffers[k]._indices == buffers[k]._indices)
    for k in ["array", "prioritized"]:
        assert _buffers[k]._index == buffers[k]._index
        assert isinstance(buffers[k].get(0, "info"), Batch)
        assert isinstance(_buffers[k].get(0, "info"), Batch)
    for k in ["array"]:
        assert np.all(
            buffers[k][:].info.number.n == _buffers[k][:].info.number.n)
        assert np.all(buffers[k][:].info.extra == _buffers[k][:].info.extra)

    # raise exception when value cannot be pickled
    data = {"not_supported": lambda x: x * x}
    grp = h5py.Group
    with pytest.raises(NotImplementedError):
        to_hdf5(data, grp)
    # ndarray with data type not supported by HDF5 that cannot be pickled
    data = {"not_supported": np.array(lambda x: x * x)}
    grp = h5py.Group
    with pytest.raises(RuntimeError):
        to_hdf5(data, grp)
Example #14
def test_replaybuffer(size=10, bufsize=20):
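    # Basic ReplayBuffer behavior: adding transitions, sampling, dtype handling
    # for object/nested info fields, and the IndexError / NotImplementedError
    # corner cases.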
    env = MyTestEnv(size)
    buf = ReplayBuffer(bufsize)
    buf.update(buf)
    assert str(buf) == buf.__class__.__name__ + '()'
    obs = env.reset()
    action_list = [1] * 5 + [0] * 10 + [1] * 10
    for i, a in enumerate(action_list):
        obs_next, rew, done, info = env.step(a)
        buf.add(obs, [a], rew, done, obs_next, info)
        obs = obs_next
        assert len(buf) == min(bufsize, i + 1)
    with pytest.raises(ValueError):
        buf._add_to_buffer('rew', np.array([1, 2, 3]))
    assert buf.act.dtype == np.object
    assert isinstance(buf.act[0], list)
    data, indice = buf.sample(bufsize * 2)
    assert (indice < len(buf)).all()
    assert (data.obs < size).all()
    assert (0 <= data.done).all() and (data.done <= 1).all()
    b = ReplayBuffer(size=10)
    # negative batch size should return an empty index
    assert b.sample_index(-1).tolist() == []
    b.add(1, 1, 1, 1, 'str', {'a': 3, 'b': {'c': 5.0}})
    assert b.obs[0] == 1
    assert b.done[0]
    assert b.obs_next[0] == 'str'
    assert np.all(b.obs[1:] == 0)
    assert np.all(b.obs_next[1:] == np.array(None))
    assert b.info.a[0] == 3 and b.info.a.dtype == np.integer
    assert np.all(b.info.a[1:] == 0)
    assert b.info.b.c[0] == 5.0 and b.info.b.c.dtype == np.inexact
    assert np.all(b.info.b.c[1:] == 0.0)
    with pytest.raises(IndexError):
        b[22]
    b = ListReplayBuffer()
    with pytest.raises(NotImplementedError):
        b.sample(0)
Example #15
def test_ddpg(args=get_args()):
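    # DDPG on Pendulum with vectorized train/test envs; trains until the reward
    # threshold is reached and watches a final episode when run as a script.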
    torch.set_num_threads(1)  # we need only one thread for the NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(args.layer_num, args.state_shape, args.action_shape,
                  args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(args.layer_num, args.state_shape, args.action_shape,
                    args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(actor,
                        actor_optim,
                        critic,
                        critic_optim,
                        args.tau,
                        args.gamma,
                        args.exploration_noise,
                        [env.action_space.low[0], env.action_space.high[0]],
                        reward_normalization=True,
                        ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
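Note: this example (like most others on this page) calls a get_args() helper that is not included in the snippet. A minimal sketch of such a helper is given below; the argument names mirror the attributes used in the function body, but every default value is an illustrative placeholder, not the original hyperparameter setting.

import argparse

import torch


def get_args():
    # hypothetical reconstruction of the missing helper; defaults are placeholders
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Pendulum-v0')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--exploration-noise', type=float, default=0.1)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--step-per-epoch', type=int, default=2400)
    parser.add_argument('--collect-per-step', type=int, default=4)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--layer-num', type=int, default=1)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument('--device', type=str,
                        default='cuda' if torch.cuda.is_available() else 'cpu')
    return parser.parse_args()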
Example #16
def test_c51(args=get_args()):
    env = make_atari_env(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    train_envs = SubprocVectorEnv(
        [lambda: make_atari_env(args) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: make_atari_env_watch(args) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # define model
    net = C51(*args.state_shape, args.action_shape, args.num_atoms,
              args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    # define policy
    policy = C51Policy(net,
                       optim,
                       args.gamma,
                       args.num_atoms,
                       args.v_min,
                       args.v_max,
                       args.n_step,
                       target_update_freq=args.target_update_freq).to(
                           args.device)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # replay buffer: `save_only_last_obs` and `stack_num` can be removed together
    # when you have enough RAM
    buffer = ReplayBuffer(args.buffer_size,
                          ignore_obs_next=True,
                          save_only_last_obs=True,
                          stack_num=args.frames_stack)
    # collector
    train_collector = Collector(policy, train_envs, buffer)
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'c51')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        if env.spec.reward_threshold:
            return mean_rewards >= env.spec.reward_threshold
        elif 'Pong' in args.task:
            return mean_rewards >= 20
        else:
            return False

    def train_fn(epoch, env_step):
        # nature DQN setting, linear decay in the first 1M steps
        if env_step <= 1e6:
            eps = args.eps_train - env_step / 1e6 * \
                (args.eps_train - args.eps_train_final)
        else:
            eps = args.eps_train_final
        policy.set_eps(eps)
        writer.add_scalar('train/eps', eps, global_step=env_step)

    def test_fn(epoch, env_step):
        policy.set_eps(args.eps_test)

    # watch agent's performance
    def watch():
        print("Testing agent ...")
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        test_collector.reset()
        result = test_collector.collect(n_episode=[1] * args.test_num,
                                        render=args.render)
        pprint.pprint(result)

    if args.watch:
        watch()
        exit(0)

    # test train_collector and start filling replay buffer
    train_collector.collect(n_step=args.batch_size * 4)
    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer,
                               test_in_train=False)

    pprint.pprint(result)
    watch()
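Note: train_fn above implements the Nature-DQN style linear epsilon decay over the first million environment steps. The same schedule, pulled out as a standalone function for clarity (a sketch that only assumes the eps_train / eps_train_final arguments already used above):

def linear_eps(env_step, eps_train, eps_train_final, decay_steps=1_000_000):
    # linearly anneal epsilon from eps_train to eps_train_final over decay_steps,
    # then hold it constant at eps_train_final
    if env_step >= decay_steps:
        return eps_train_final
    return eps_train - env_step / decay_steps * (eps_train - eps_train_final)


# e.g. with eps_train=1.0 and eps_train_final=0.05:
#   linear_eps(0, 1.0, 0.05)       == 1.0
#   linear_eps(500_000, 1.0, 0.05) == 0.525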
Example #17
def test_a2c_with_il(args=get_args()):
    torch.set_num_threads(1)  # limit to one thread on weak CPUs
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Categorical
    policy = A2CPolicy(actor,
                       critic,
                       optim,
                       dist,
                       args.gamma,
                       gae_lambda=args.gae_lambda,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       max_grad_norm=args.max_grad_norm,
                       reward_normalization=args.rew_norm)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'a2c')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              save_fn=save_fn,
                              writer=writer)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')

    policy.eval()
    # here we define an imitation collector with a trivial policy
    if args.task == 'CartPole-v0':
        env.spec.reward_threshold = 190  # lower the goal
    net = Net(1, args.state_shape, device=args.device)
    net = Actor(net, args.action_shape).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
    il_policy = ImitationPolicy(net, optim, mode='discrete')
    il_test_collector = Collector(
        il_policy,
        DummyVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.test_num)]))
    train_collector.reset()
    result = offpolicy_trainer(il_policy,
                               train_collector,
                               il_test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        il_policy.eval()
        collector = Collector(il_policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
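Note: the imitation stage above reuses the filled A2C replay buffer as a demonstration dataset. Conceptually, ImitationPolicy in discrete mode performs behavioral cloning; a minimal self-contained sketch of that kind of update (an illustration, not Tianshou's actual implementation) is shown below, where model is assumed to be a plain torch.nn.Module mapping observations to action logits.

import torch
import torch.nn.functional as F


def bc_update(model, optim, obs, act):
    # one behavioral-cloning step: cross-entropy between the model's action
    # logits and the demonstrated (expert) actions
    logits = model(torch.as_tensor(obs, dtype=torch.float32))
    loss = F.cross_entropy(logits, torch.as_tensor(act, dtype=torch.long))
    optim.zero_grad()
    loss.backward()
    optim.step()
    return loss.item()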
Example #18
def test_ddpg(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    args.exploration_noise = args.exploration_noise * args.max_action
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    # train_envs = gym.make(args.task)
    if args.training_num > 1:
        train_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.training_num)])
    else:
        train_envs = gym.make(args.task)
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net_a = Net(args.state_shape,
                hidden_sizes=args.hidden_sizes,
                device=args.device)
    actor = Actor(net_a,
                  args.action_shape,
                  max_action=args.max_action,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c = Net(args.state_shape,
                args.action_shape,
                hidden_sizes=args.hidden_sizes,
                concat=True,
                device=args.device)
    critic = Critic(net_c, device=args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor,
        actor_optim,
        critic,
        critic_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        estimation_step=args.n_step,
        action_space=env.action_space)

    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    if args.training_num > 1:
        buffer = VectorReplayBuffer(args.buffer_size, len(train_envs))
    else:
        buffer = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy,
                                train_envs,
                                buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    train_collector.collect(n_step=args.start_timesteps, random=True)
    # log
    t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
    log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_ddpg'
    log_path = os.path.join(args.logdir, args.task, 'ddpg', log_file)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    if not args.watch:
        # trainer
        result = offpolicy_trainer(policy,
                                   train_collector,
                                   test_collector,
                                   args.epoch,
                                   args.step_per_epoch,
                                   args.step_per_collect,
                                   args.test_num,
                                   args.batch_size,
                                   save_best_fn=save_best_fn,
                                   logger=logger,
                                   update_per_step=args.update_per_step,
                                   test_in_train=False)
        pprint.pprint(result)

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num,
                                    render=args.render)
    print(
        f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}'
    )
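Note: the import block this script relies on is not shown. For recent 0.4.x releases of Tianshou it would likely look as follows; module paths can differ between versions, so treat this as an assumption rather than the exact original header.

import argparse
import datetime
import os
import pprint

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer
from tianshou.env import SubprocVectorEnv
from tianshou.exploration import GaussianNoise
from tianshou.policy import DDPGPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import Actor, Critic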
Example #19
File: test_pg.py, Project: wu-yy/tianshou
def test_pg(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num,
              args.state_shape,
              args.action_shape,
              device=args.device,
              softmax=True)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PGPolicy(net,
                      optim,
                      dist,
                      args.gamma,
                      reward_normalization=args.rew_norm)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'pg')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              save_fn=save_fn,
                              writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
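Note: PGPolicy above trains with REINFORCE-style discounted returns, G_t = r_t + gamma * G_{t+1}. A minimal sketch of that computation for a single finished episode (illustrative only, not the library's buffered implementation):

def discounted_returns(rewards, gamma):
    # walk the episode backwards: G_t = r_t + gamma * G_{t+1}
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    return list(reversed(returns))


# discounted_returns([1, 1, 1], 0.9) -> [2.71, 1.9, 1.0]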
Example #20
def test_collector_with_dict_state():
    env = MyTestEnv(size=5, sleep=0, dict_state=True)
    policy = MyPolicy(dict_state=True)
    c0 = Collector(policy, env, ReplayBuffer(size=100),
                   Logger.single_preprocess_fn)
    c0.collect(n_step=3)
    c0.collect(n_episode=2)
    assert len(c0.buffer) == 10
    env_fns = [
        lambda x=i: MyTestEnv(size=x, sleep=0, dict_state=True)
        for i in [2, 3, 4, 5]
    ]
    envs = DummyVectorEnv(env_fns)
    envs.seed(666)
    obs = envs.reset()
    assert not np.isclose(obs[0]['rand'], obs[1]['rand'])
    c1 = Collector(policy, envs,
                   VectorReplayBuffer(total_size=100, buffer_num=4),
                   Logger.single_preprocess_fn)
    c1.collect(n_step=12)
    result = c1.collect(n_episode=8)
    assert result['n/ep'] == 8
    lens = np.bincount(result['lens'])
    assert result['n/st'] == 21 and np.all(lens == [0, 0, 2, 2, 2, 2]) or \
        result['n/st'] == 20 and np.all(lens == [0, 0, 3, 1, 2, 2])
    batch, _ = c1.buffer.sample(10)
    c0.buffer.update(c1.buffer)
    assert len(c0.buffer) in [42, 43]
    if len(c0.buffer) == 42:
        assert np.all(c0.buffer[:].obs.index[..., 0] == [
            0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2,
            0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
        ]), c0.buffer[:].obs.index[..., 0]
    else:
        assert np.all(c0.buffer[:].obs.index[..., 0] == [
            0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 0, 1, 0, 1, 0, 1, 2, 0, 1, 2,
            0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
        ]), c0.buffer[:].obs.index[..., 0]
    c2 = Collector(
        policy, envs,
        VectorReplayBuffer(total_size=100, buffer_num=4, stack_num=4),
        Logger.single_preprocess_fn)
    c2.collect(n_episode=10)
    batch, _ = c2.buffer.sample(10)
Example #21
def test_sac(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    train_envs = gym.make(args.task)
    # train_envs = VectorEnv(
    #     [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = gym.make(args.task)
    # test_envs = SubprocVectorEnv(
    #     [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(args.layer_num,
                      args.state_shape,
                      args.action_shape,
                      args.max_action,
                      args.device,
                      unbounded=True).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor,
                       actor_optim,
                       critic1,
                       critic1_optim,
                       critic2,
                       critic2_optim,
                       args.tau,
                       args.gamma,
                       args.alpha,
                       [env.action_space.low[0], env.action_space.high[0]],
                       reward_normalization=args.rew_norm,
                       ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #22
def test_collector_with_ma():
    env = MyTestEnv(size=5, sleep=0, ma_rew=4)
    policy = MyPolicy()
    c0 = Collector(policy, env, ReplayBuffer(size=100),
                   Logger.single_preprocess_fn)
    # n_step=3 is fewer than the episode length (5), so no episode finishes yet
    r = c0.collect(n_step=3)['rews']
    assert len(r) == 0
    r = c0.collect(n_episode=2)['rews']
    assert r.shape == (2, 4) and np.all(r == 1)
    env_fns = [
        lambda x=i: MyTestEnv(size=x, sleep=0, ma_rew=4) for i in [2, 3, 4, 5]
    ]
    envs = DummyVectorEnv(env_fns)
    c1 = Collector(policy, envs,
                   VectorReplayBuffer(total_size=100, buffer_num=4),
                   Logger.single_preprocess_fn)
    r = c1.collect(n_step=12)['rews']
    assert r.shape == (2, 4) and np.all(r == 1), r
    r = c1.collect(n_episode=8)['rews']
    assert r.shape == (8, 4) and np.all(r == 1)
    batch, _ = c1.buffer.sample(10)
    print(batch)
    c0.buffer.update(c1.buffer)
    assert len(c0.buffer) in [42, 43]
    if len(c0.buffer) == 42:
        rew = [
            0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
            0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        ]
    else:
        rew = [
            0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
            0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        ]
    assert np.all(c0.buffer[:].rew == [[x] * 4 for x in rew])
    assert np.all(c0.buffer[:].done == rew)
    c2 = Collector(
        policy, envs,
        VectorReplayBuffer(total_size=100, buffer_num=4, stack_num=4),
        Logger.single_preprocess_fn)
    r = c2.collect(n_episode=10)['rews']
    assert r.shape == (10, 4) and np.all(r == 1)
    batch, _ = c2.buffer.sample(10)
Example #23
def test_td3(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(args.layer_num, args.state_shape, args.action_shape,
                  args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(actor,
                       actor_optim,
                       critic1,
                       critic1_optim,
                       critic2,
                       critic2_optim,
                       args.tau,
                       args.gamma,
                       args.exploration_noise,
                       args.policy_noise,
                       args.update_actor_freq,
                       args.noise_clip,
                       [env.action_space.low[0], env.action_space.high[0]],
                       reward_normalization=True,
                       ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'td3')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               writer=writer,
                               task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #24
def test_collector_with_atari_setting():
    reference_obs = np.zeros([6, 4, 84, 84])
    for i in range(6):
        reference_obs[i, 3, np.arange(84), np.arange(84)] = i
        reference_obs[i, 2, np.arange(84)] = i
        reference_obs[i, 1, :, np.arange(84)] = i
        reference_obs[i, 0] = i

    # atari single buffer
    env = MyTestEnv(size=5, sleep=0, array_state=True)
    policy = MyPolicy()
    c0 = Collector(policy, env, ReplayBuffer(size=100))
    c0.collect(n_step=6)
    c0.collect(n_episode=2)
    assert c0.buffer.obs.shape == (100, 4, 84, 84)
    assert c0.buffer.obs_next.shape == (100, 4, 84, 84)
    assert len(c0.buffer) == 15
    obs = np.zeros_like(c0.buffer.obs)
    obs[np.arange(15)] = reference_obs[np.arange(15) % 5]
    assert np.all(obs == c0.buffer.obs)

    c1 = Collector(policy, env, ReplayBuffer(size=100, ignore_obs_next=True))
    c1.collect(n_episode=3)
    assert np.allclose(c0.buffer.obs, c1.buffer.obs)
    with pytest.raises(AttributeError):
        c1.buffer.obs_next
    assert np.all(reference_obs[[1, 2, 3, 4, 4] * 3] == c1.buffer[:].obs_next)

    c2 = Collector(
        policy, env,
        ReplayBuffer(size=100, ignore_obs_next=True, save_only_last_obs=True))
    c2.collect(n_step=8)
    assert c2.buffer.obs.shape == (100, 84, 84)
    obs = np.zeros_like(c2.buffer.obs)
    obs[np.arange(8)] = reference_obs[[0, 1, 2, 3, 4, 0, 1, 2], -1]
    assert np.all(c2.buffer.obs == obs)
    assert np.allclose(c2.buffer[:].obs_next,
                       reference_obs[[1, 2, 3, 4, 4, 1, 2, 2], -1])

    # atari multi buffer
    env_fns = [
        lambda x=i: MyTestEnv(size=x, sleep=0, array_state=True)
        for i in [2, 3, 4, 5]
    ]
    envs = DummyVectorEnv(env_fns)
    c3 = Collector(policy, envs,
                   VectorReplayBuffer(total_size=100, buffer_num=4))
    c3.collect(n_step=12)
    result = c3.collect(n_episode=9)
    assert result["n/ep"] == 9 and result["n/st"] == 23
    assert c3.buffer.obs.shape == (100, 4, 84, 84)
    obs = np.zeros_like(c3.buffer.obs)
    obs[np.arange(8)] = reference_obs[[0, 1, 0, 1, 0, 1, 0, 1]]
    obs[np.arange(25, 34)] = reference_obs[[0, 1, 2, 0, 1, 2, 0, 1, 2]]
    obs[np.arange(50, 58)] = reference_obs[[0, 1, 2, 3, 0, 1, 2, 3]]
    obs[np.arange(75, 85)] = reference_obs[[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]]
    assert np.all(obs == c3.buffer.obs)
    obs_next = np.zeros_like(c3.buffer.obs_next)
    obs_next[np.arange(8)] = reference_obs[[1, 2, 1, 2, 1, 2, 1, 2]]
    obs_next[np.arange(25, 34)] = reference_obs[[1, 2, 3, 1, 2, 3, 1, 2, 3]]
    obs_next[np.arange(50, 58)] = reference_obs[[1, 2, 3, 4, 1, 2, 3, 4]]
    obs_next[np.arange(75, 85)] = reference_obs[[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]]
    assert np.all(obs_next == c3.buffer.obs_next)
    c4 = Collector(
        policy, envs,
        VectorReplayBuffer(total_size=100,
                           buffer_num=4,
                           stack_num=4,
                           ignore_obs_next=True,
                           save_only_last_obs=True))
    c4.collect(n_step=12)
    result = c4.collect(n_episode=9)
    assert result["n/ep"] == 9 and result["n/st"] == 23
    assert c4.buffer.obs.shape == (100, 84, 84)
    obs = np.zeros_like(c4.buffer.obs)
    slice_obs = reference_obs[:, -1]
    obs[np.arange(8)] = slice_obs[[0, 1, 0, 1, 0, 1, 0, 1]]
    obs[np.arange(25, 34)] = slice_obs[[0, 1, 2, 0, 1, 2, 0, 1, 2]]
    obs[np.arange(50, 58)] = slice_obs[[0, 1, 2, 3, 0, 1, 2, 3]]
    obs[np.arange(75, 85)] = slice_obs[[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]]
    assert np.all(c4.buffer.obs == obs)
    obs_next = np.zeros([len(c4.buffer), 4, 84, 84])
    ref_index = np.array([
        1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 3, 3,
        1, 2, 3, 3, 1, 2, 3, 4, 4, 1, 2, 3, 4, 4,
    ])
    obs_next[:, -1] = slice_obs[ref_index]
    ref_index -= 1
    ref_index[ref_index < 0] = 0
    obs_next[:, -2] = slice_obs[ref_index]
    ref_index -= 1
    ref_index[ref_index < 0] = 0
    obs_next[:, -3] = slice_obs[ref_index]
    ref_index -= 1
    ref_index[ref_index < 0] = 0
    obs_next[:, -4] = slice_obs[ref_index]
    assert np.all(obs_next == c4.buffer[:].obs_next)

    buf = ReplayBuffer(100,
                       stack_num=4,
                       ignore_obs_next=True,
                       save_only_last_obs=True)
    c5 = Collector(policy, envs, CachedReplayBuffer(buf, 4, 10))
    result_ = c5.collect(n_step=12)
    assert len(buf) == 5 and len(c5.buffer) == 12
    result = c5.collect(n_episode=9)
    assert result["n/ep"] == 9 and result["n/st"] == 23
    assert len(buf) == 35
    assert np.all(buf.obs[:len(buf)] == slice_obs[[
        0, 1, 0, 1, 2, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 0, 1, 2, 0, 1, 0,
        1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4
    ]])
    assert np.all(buf[:].obs_next[:, -1] == slice_obs[[
        1, 1, 1, 2, 2, 1, 1, 1, 2, 3, 3, 1, 2, 3, 4, 4, 1, 1, 1, 2, 2, 1, 1, 1,
        2, 3, 3, 1, 2, 2, 1, 2, 3, 4, 4
    ]])
    assert len(buf) == len(c5.buffer)

    # test buffer=None
    c6 = Collector(policy, envs)
    result1 = c6.collect(n_step=12)
    for key in ["n/ep", "n/st", "rews", "lens"]:
        assert np.allclose(result1[key], result_[key])
    result2 = c6.collect(n_episode=9)
    for key in ["n/ep", "n/st", "rews", "lens"]:
        assert np.allclose(result2[key], result[key])
Example #25
def test_dqn(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    Q_param = {"hidden_sizes": args.dueling_q_hidden_sizes}
    V_param = {"hidden_sizes": args.dueling_v_hidden_sizes}
    net = Net(args.state_shape,
              args.action_shape,
              hidden_sizes=args.hidden_sizes,
              device=args.device,
              dueling_param=(Q_param, V_param)).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net,
                       optim,
                       args.gamma,
                       args.n_step,
                       target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    train_collector.collect(n_step=args.batch_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'dqn')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    def train_fn(epoch, env_step):
        if env_step <= 100000:
            policy.set_eps(args.eps_train)
        elif env_step <= 500000:
            eps = args.eps_train - (env_step - 100000) / \
                400000 * (0.5 * args.eps_train)
            policy.set_eps(eps)
        else:
            policy.set_eps(0.5 * args.eps_train)

    def test_fn(epoch, env_step):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)

    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        test_collector.reset()
        result = test_collector.collect(n_episode=[1] * args.test_num,
                                        render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
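Note: save_fn above stores only the policy's state dict. A matching way to restore it later for evaluation is sketched below; it assumes policy, log_path, and args have been rebuilt exactly as in test_dqn above.

import os

import torch


def load_trained_policy(policy, log_path, device, eps_test):
    # restore the weights written by save_fn and switch to evaluation mode
    policy.load_state_dict(
        torch.load(os.path.join(log_path, 'policy.pth'), map_location=device))
    policy.eval()
    policy.set_eps(eps_test)
    return policy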
Example #26
def test_collector():
    writer = SummaryWriter('log/collector')
    logger = Logger(writer)
    env_fns = [lambda x=i: MyTestEnv(size=x, sleep=0) for i in [2, 3, 4, 5]]

    venv = SubprocVectorEnv(env_fns)
    dum = DummyVectorEnv(env_fns)
    policy = MyPolicy()
    env = env_fns[0]()
    c0 = Collector(policy, env, ReplayBuffer(size=100), logger.preprocess_fn)
    c0.collect(n_step=3)
    assert len(c0.buffer) == 3
    assert np.allclose(c0.buffer.obs[:4, 0], [0, 1, 0, 0])
    assert np.allclose(c0.buffer[:].obs_next[..., 0], [1, 2, 1])
    c0.collect(n_episode=3)
    assert len(c0.buffer) == 8
    assert np.allclose(c0.buffer.obs[:10, 0], [0, 1, 0, 1, 0, 1, 0, 1, 0, 0])
    assert np.allclose(c0.buffer[:].obs_next[..., 0], [1, 2, 1, 2, 1, 2, 1, 2])
    c0.collect(n_step=3, random=True)
    c1 = Collector(policy, venv,
                   VectorReplayBuffer(total_size=100, buffer_num=4),
                   logger.preprocess_fn)
    c1.collect(n_step=8)
    obs = np.zeros(100)
    obs[[0, 1, 25, 26, 50, 51, 75, 76]] = [0, 1, 0, 1, 0, 1, 0, 1]

    assert np.allclose(c1.buffer.obs[:, 0], obs)
    assert np.allclose(c1.buffer[:].obs_next[..., 0], [1, 2, 1, 2, 1, 2, 1, 2])
    c1.collect(n_episode=4)
    assert len(c1.buffer) == 16
    obs[[2, 3, 27, 52, 53, 77, 78, 79]] = [0, 1, 2, 2, 3, 2, 3, 4]
    assert np.allclose(c1.buffer.obs[:, 0], obs)
    assert np.allclose(c1.buffer[:].obs_next[..., 0],
                       [1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5])
    c1.collect(n_episode=4, random=True)
    c2 = Collector(policy, dum, VectorReplayBuffer(total_size=100,
                                                   buffer_num=4),
                   logger.preprocess_fn)
    c2.collect(n_episode=7)
    obs1 = obs.copy()
    obs1[[4, 5, 28, 29, 30]] = [0, 1, 0, 1, 2]
    obs2 = obs.copy()
    obs2[[28, 29, 30, 54, 55, 56, 57]] = [0, 1, 2, 0, 1, 2, 3]
    c2obs = c2.buffer.obs[:, 0]
    assert np.all(c2obs == obs1) or np.all(c2obs == obs2)
    c2.reset_env()
    c2.reset_buffer()
    assert c2.collect(n_episode=8)['n/ep'] == 8
    obs[[4, 5, 28, 29, 30, 54, 55, 56, 57]] = [0, 1, 0, 1, 2, 0, 1, 2, 3]
    assert np.all(c2.buffer.obs[:, 0] == obs)
    c2.collect(n_episode=4, random=True)

    # test corner case
    with pytest.raises(TypeError):
        Collector(policy, dum, ReplayBuffer(10))
    with pytest.raises(TypeError):
        Collector(policy, dum, PrioritizedReplayBuffer(10, 0.5, 0.5))
    with pytest.raises(TypeError):
        c2.collect()

    # test NXEnv
    for obs_type in ["array", "object"]:
        envs = SubprocVectorEnv(
            [lambda i=x: NXEnv(i, obs_type) for x in [5, 10, 15, 20]])
        c3 = Collector(policy, envs,
                       VectorReplayBuffer(total_size=100, buffer_num=4))
        c3.collect(n_step=6)
        assert c3.buffer.obs.dtype == object
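Note: MyPolicy is a small test helper that is not reproduced on this page. For collector tests like the ones above, any BasePolicy subclass whose forward returns a fixed action will do; a plausible stand-in (hypothetical, not the project's actual helper) could look like:

import numpy as np

from tianshou.data import Batch
from tianshou.policy import BasePolicy


class ConstantPolicy(BasePolicy):
    # always step every environment with action 1 and never learn anything
    def forward(self, batch, state=None, **kwargs):
        return Batch(act=np.ones(len(batch.obs), dtype=np.int64))

    def learn(self, batch, **kwargs):
        return {}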
Example #27
def test_discrete_sac(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n

    train_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape,
              hidden_sizes=args.hidden_sizes,
              device=args.device)
    actor = Actor(net,
                  args.action_shape,
                  softmax_output=False,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c1 = Net(args.state_shape,
                 hidden_sizes=args.hidden_sizes,
                 device=args.device)
    critic1 = Critic(net_c1, last_size=args.action_shape,
                     device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    net_c2 = Net(args.state_shape,
                 hidden_sizes=args.hidden_sizes,
                 device=args.device)
    critic2 = Critic(net_c2, last_size=args.action_shape,
                     device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    # better not to use auto alpha in CartPole
    if args.auto_alpha:
        target_entropy = 0.98 * np.log(np.prod(args.action_shape))
        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        args.alpha = (target_entropy, log_alpha, alpha_optim)

    policy = DiscreteSACPolicy(actor,
                               actor_optim,
                               critic1,
                               critic1_optim,
                               critic2,
                               critic2_optim,
                               args.tau,
                               args.gamma,
                               args.alpha,
                               reward_normalization=args.rew_norm,
                               ignore_done=args.ignore_done)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'discrete_sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer,
                               test_in_train=False)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
Example #28
def train_agent(
    args: argparse.Namespace = get_args(),
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[dict, BasePolicy]:
    def env_func():
        return TicTacToeEnv(args.board_size, args.win_size)

    train_envs = DummyVectorEnv([env_func for _ in range(args.training_num)])
    test_envs = DummyVectorEnv([env_func for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    policy, optim = get_agents(args,
                               agent_learn=agent_learn,
                               agent_opponent=agent_opponent,
                               optim=optim)

    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    train_collector.collect(n_step=args.batch_size)
    # log
    if not hasattr(args, 'writer'):
        log_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn')
        writer = SummaryWriter(log_path)
        args.writer = writer
    else:
        writer = args.writer

    def save_fn(policy):
        if hasattr(args, 'model_save_path'):
            model_save_path = args.model_save_path
        else:
            model_save_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn',
                                           'policy.pth')
        torch.save(policy.policies[args.agent_id - 1].state_dict(),
                   model_save_path)

    def stop_fn(x):
        return x >= args.win_rate

    def train_fn(x):
        policy.policies[args.agent_id - 1].set_eps(args.eps_train)

    def test_fn(x):
        policy.policies[args.agent_id - 1].set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer,
                               test_in_train=False)

    return result, policy.policies[args.agent_id - 1]
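Note: a minimal way to drive the trainer above from a script (a sketch; get_args, get_agents, and TicTacToeEnv are assumed to come from the same tic-tac-toe example package as this function):

if __name__ == '__main__':
    # train a DQN agent for the tic-tac-toe environment and report the result
    args = get_args()
    result, trained_agent = train_agent(args)
    print('best reward during training:', result['best_reward'])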
Example #29
def test_nstep_returns_with_timelimit(size=10000):
    buf = ReplayBuffer(10)
    for i in range(12):
        buf.add(
            Batch(
                obs=0,
                act=0,
                rew=i + 1,
                done=i % 4 == 3,
                info={"TimeLimit.truncated": i == 3}
            )
        )
    batch, indices = buf.sample(0)
    assert np.allclose(indices, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1])
    # rew:  [11, 12, 3, 4, 5, 6, 7, 8, 9, 10]
    # done: [ 0,  1, 0, 1, 0, 0, 0, 1, 0, 0]
    # test nstep = 1
    returns = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn, gamma=.1, n_step=1
        ).pop('returns').reshape(-1)
    )
    assert np.allclose(returns, [2.6, 3.6, 4.4, 5.3, 6.2, 8, 8, 8.9, 9.8, 12])
    r_ = compute_nstep_return_base(1, .1, buf, indices)
    assert np.allclose(returns, r_), (r_, returns)
    returns_multidim = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=1
        ).pop('returns')
    )
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
    # test nstep = 2
    returns = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn, gamma=.1, n_step=2
        ).pop('returns').reshape(-1)
    )
    assert np.allclose(returns, [3.36, 3.6, 5.53, 6.62, 7.8, 8, 9.89, 10.98, 12.2, 12])
    r_ = compute_nstep_return_base(2, .1, buf, indices)
    assert np.allclose(returns, r_)
    returns_multidim = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=2
        ).pop('returns')
    )
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
    # test nstep = 10
    returns = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn, gamma=.1, n_step=10
        ).pop('returns').reshape(-1)
    )
    assert np.allclose(
        returns, [3.36, 3.6, 5.678, 6.78, 7.8, 8, 10.122, 11.22, 12.2, 12]
    )
    r_ = compute_nstep_return_base(10, .1, buf, indices)
    assert np.allclose(returns, r_)
    returns_multidim = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=10
        ).pop('returns')
    )
    assert np.allclose(returns_multidim, returns[:, np.newaxis])

    if __name__ == '__main__':
        buf = ReplayBuffer(size)
        for i in range(int(size * 1.5)):
            buf.add(
                Batch(
                    obs=0,
                    act=0,
                    rew=i + 1,
                    done=np.random.randint(3) == 0,
                    info={"TimeLimit.truncated": i % 33 == 0}
                )
            )
        batch, indices = buf.sample(256)

        def vanilla():
            return compute_nstep_return_base(3, .1, buf, indices)

        def optimized():
            return BasePolicy.compute_nstep_return(
                batch, buf, indices, target_q_fn, gamma=.1, n_step=3
            )

        cnt = 3000
        print('nstep vanilla', timeit(vanilla, setup=vanilla, number=cnt))
        print('nstep optim  ', timeit(optimized, setup=optimized, number=cnt))
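Note: the quantity under test is the n-step return, G_t(n) = sum_{k=0..n-1} gamma^k * r_{t+k} + gamma^n * Q(s_{t+n}), with the bootstrap term dropped if the episode terminates inside the window. A minimal pure-Python sketch of that recursion (it ignores the TimeLimit.truncated handling above and assumes the flat arrays are long enough):

def nstep_return(rew, done, target_q, t, gamma, n_step):
    # accumulate discounted rewards forward from index t
    ret, discount = 0.0, 1.0
    for k in range(n_step):
        ret += discount * rew[t + k]
        if done[t + k]:  # episode ended inside the window: no bootstrap
            return ret
        discount *= gamma
    # bootstrap with the target Q-value of the state n steps ahead
    return ret + discount * target_q[t + n_step]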
Example #30
def test_ppo(args=get_args()):
    torch.set_num_threads(1)  # we only need one thread for the NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape,
              hidden_sizes=args.hidden_sizes,
              device=args.device)
    actor = ActorProb(net,
                      args.action_shape,
                      max_action=args.max_action,
                      device=args.device).to(args.device)
    critic = Critic(Net(args.state_shape,
                        hidden_sizes=args.hidden_sizes,
                        device=args.device),
                    device=args.device).to(args.device)
    # orthogonal initialization
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(set(actor.parameters()).union(
        critic.parameters()),
                             lr=args.lr)

    # replace DiagGaussian with Independent(Normal), which is equivalent;
    # pass *logits to be consistent with policy.forward
    def dist(*logits):
        return Independent(Normal(*logits), 1)

    policy = PPOPolicy(
        actor,
        critic,
        optim,
        dist,
        args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        # dual_clip=args.dual_clip,
        # dual clip causes a monotonically increasing log_std :)
        value_clip=args.value_clip,
        # action_range=[env.action_space.low[0], env.action_space.high[0]],)
        # clipping the action keeps PPO from converging :)
        gae_lambda=args.gae_lambda)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              save_fn=save_fn,
                              writer=writer)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
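Note: the dist helper above wraps Normal in Independent(..., 1) so that log_prob sums over the action dimensions and returns a single log-probability per action, which is what the policy-gradient loss expects. A small standalone check:

import torch
from torch.distributions import Independent, Normal

mu, sigma = torch.zeros(3), torch.ones(3)
d = Independent(Normal(mu, sigma), 1)
a = d.sample()
print(a.shape, d.log_prob(a).shape)  # torch.Size([3]) torch.Size([])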