Example #1
def main(do_render: bool, seed: int, as_gdads: bool, name: str,
         do_train: bool):
    drop_abs_position = True

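    # Training environment: skill-wrapped when running as GDADS, otherwise flattened;
    # rewards are scaled and episodes monitored.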
    conf: Conf = CONFS[name]
    dict_env = get_env(name=name,
                       drop_abs_position=drop_abs_position,
                       is_training=True)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env)
    else:
        flat_env = flatten_env(dict_env, drop_abs_position)
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

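    # Evaluation environment, built separately from the training one.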
    dict_env = get_env(name=name,
                       drop_abs_position=drop_abs_position,
                       is_training=False)
    if as_gdads:
        use_slider = False
        if use_slider:
            eval_env = SliderWrapper(env=dict_env)
        else:
            eval_env = GDADSEvalWrapper(dict_env,
                                        sw=BestSkillProvider(flat_env))
    else:
        eval_env = flatten_env(dict_env=dict_env,
                               drop_abs_position=drop_abs_position)

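    # Resume from an existing checkpoint if one is found; otherwise create a fresh SAC learner.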
    filename = f"modelsCommandSkills/{name}/asGDADS{as_gdads}/resamplingFalse_goalSpaceTrue-seed-{seed}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename + ".zip", env=flat_env)
        print(f"loaded model {filename}")
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy",
                  env=flat_env,
                  verbose=1,
                  learning_rate=conf.lr,
                  tensorboard_log=filename,
                  buffer_size=conf.buffer_size,
                  batch_size=conf.batch_size,
                  gamma=gamma(conf.ep_len),
                  learning_starts=100 * conf.ep_len,
                  policy_kwargs=dict(log_std_init=-3,
                                     net_arch=[conf.layer_size] * 2),
                  seed=seed,
                  device="cuda",
                  train_freq=4)
    if do_train:
        train(model=sac, conf=conf, save_fname=filename, eval_env=eval_env)
    if do_render:
        show(model=sac, env=eval_env, conf=conf)
    do_eval = not do_train and not do_render
    if do_eval:
        results = ant_grid_evaluation(model=sac,
                                      env=eval_env,
                                      episode_len=conf.ep_len)
        dump_ant_grid_evaluation(results)
Example #2
def wrap(env):
    # Applying normalisation of observations
    wrapper_observation = NormalizeObservationSpace(
        env, lambda o: o / env.unwrapped.observation_space.high)
    # Applying normalisation of rewards
    wrapper_reward = TransformReward(wrapper_observation, lambda r: 1.e0 * r)
    return wrapper_reward
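A minimal usage sketch for the wrapper above, assuming an environment with finite, symmetric observation bounds (the environment id and the pre-0.26 Gym step API are illustrative assumptions, not part of the example):

# Hypothetical usage of wrap(); "Pendulum-v1" is only an illustrative choice and
# the division by observation_space.high assumes the bounds are finite.
import gym

env = wrap(gym.make("Pendulum-v1"))
obs = env.reset()                      # observations are scaled into [-1, 1]
obs, reward, done, info = env.step(env.action_space.sample())
# the reward transform multiplies by 1.e0, i.e. it leaves rewards unchanged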
Example #3
def atari_wrapper(env):
    # This is substantially the same CNN as in (Mnih et al., 2016; 2015),
    # the only difference is that in the pre-processing stage
    # we retain all colour channels.
    env = AtariPreprocessing(env, grayscale_obs=False, scale_obs=True)
    env = ReturnWrapper(env)
    env = TransformReward(env, lambda r: np.sign(r))
    return env
Example #4
def snake_wrapper(env,
                  time_limit=200,
                  default_reward=0,
                  stack_length=3):
    env = TransformReward(env, reward_wrapper_func(default_reward))
    env = TimeLimit(env, time_limit)
    env = SnakeStack(env, stack_length)
    return env
Example #5
def make_env(env_name):

    if env_name == 'fourrooms':
        return Fourrooms(), False

    env = gym.make(env_name)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = AtariPreprocessing(env, grayscale_obs=True, scale_obs=True, terminal_on_life_loss=True)
        env = TransformReward(env, lambda r: np.clip(r, -1, 1))
        env = FrameStack(env, 4)
    return env, is_atari
Example #6
def make_env(env_id, seed, reward_noise_scale=None):
    try:
        env = gym.make(env_id)
    except gym.error.UnregisteredEnv:
        register(id=env_id,
                 entry_point=ALL_V1_ENVIRONMENTS[env_id],
                 max_episode_steps=150)
        print("Registered env", env_id)
        env = gym.make(env_id)
        assert_env(env)
    env.seed(seed)
    setattr(env, 'is_metaworld', env_id in ALL_V1_ENVIRONMENTS.keys())
    if reward_noise_scale:
        from gym.wrappers import TransformReward
        max_step_bk = env._max_episode_steps
        env = TransformReward(env, lambda r: r + reward_noise_scale * randn())
        setattr(env, "_max_episode_steps", max_step_bk)
    return env
Example #7
def main():
    as_gdads = True
    name = "pointmass"
    drop_abs_position = True

    dads_env_fn = envs_fns[name]
    conf: Conf = CONFS[name]

    dict_env = as_dict_env(dads_env_fn())
    dict_env = TimeLimit(dict_env, max_episode_steps=conf.ep_len)
    if drop_abs_position:
        dict_env = DropGoalEnvsAbsoluteLocation(dict_env)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env, skill_reset_steps=conf.ep_len // 2)
    else:
        flat_obs_content = ["observation", "desired_goal", "achieved_goal"]
        if drop_abs_position:
            flat_obs_content.remove("achieved_goal")  # Because always 0 vector
        flat_env = FlattenObservation(FilterObservation(dict_env, filter_keys=flat_obs_content))

    flat_env = TransformReward(flat_env, f=lambda r: r*conf.reward_scaling)
    flat_env = Monitor(flat_env)

    filename = f"modelsCommandSkills/{name}-gdads{as_gdads}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename, env=flat_env)
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=f"{filename}-tb", buffer_size=10000)
        train(model=sac, conf=conf, save_fname=filename)
        if as_gdads:
            flat_env.save(filename)

    if as_gdads:
        flat_env.set_sac(sac)
        eval_dict_env(dict_env=dict_env,
                      model=flat_env,
                      ep_len=conf.ep_len)
    show(model=sac, env=flat_env, conf=conf)
Example #8
    def __init__(self,
                 game,
                 stack=False,
                 sticky_action=False,
                 clip_reward=False,
                 terminal_on_life_loss=False,
                 **kwargs):
        # set action_probability=0.25 if sticky_action=True
        env_id = '{}NoFrameskip-v{}'.format(game, 0 if sticky_action else 4)

        # use official atari wrapper
        env = AtariPreprocessing(gym.make(env_id),
                                 terminal_on_life_loss=terminal_on_life_loss)

        if stack:
            env = FrameStack(env, num_stack=4)

        if clip_reward:
            env = TransformReward(env, lambda r: np.clip(r, -1.0, 1.0))

        self._env = env

        self.observation_space = env.observation_space
        self.action_space = env.action_space
Example #9
def test_transform_reward(env_id):
    # use case #1: scale
    scales = [0.1, 200]
    for scale in scales:
        env = gym.make(env_id)
        wrapped_env = TransformReward(gym.make(env_id), lambda r: scale * r)
        action = env.action_space.sample()

        env.reset(seed=0)
        wrapped_env.reset(seed=0)

        _, reward, _, _ = env.step(action)
        _, wrapped_reward, _, _ = wrapped_env.step(action)

        assert wrapped_reward == scale * reward
    del env, wrapped_env

    # use case #2: clip
    min_r = -0.0005
    max_r = 0.0002
    env = gym.make(env_id)
    wrapped_env = TransformReward(gym.make(env_id),
                                  lambda r: np.clip(r, min_r, max_r))
    action = env.action_space.sample()

    env.reset(seed=0)
    wrapped_env.reset(seed=0)

    _, reward, _, _ = env.step(action)
    _, wrapped_reward, _, _ = wrapped_env.step(action)

    assert abs(wrapped_reward) < abs(reward)
    assert wrapped_reward == -0.0005 or wrapped_reward == 0.0002
    del env, wrapped_env

    # use case #3: sign
    env = gym.make(env_id)
    wrapped_env = TransformReward(gym.make(env_id), lambda r: np.sign(r))

    env.reset(seed=0)
    wrapped_env.reset(seed=0)

    for _ in range(1000):
        action = env.action_space.sample()
        _, wrapped_reward, done, _ = wrapped_env.step(action)
        assert wrapped_reward in [-1.0, 0.0, 1.0]
        if done:
            break
    del env, wrapped_env
Example #10
def test_sac():
    args, log_path, writer = get_args()
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = ShmPipeVecEnv([
        lambda: TransformReward(BipedalWrapper(gym.make(args.task)),
                                lambda reward: 5 * reward)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = ShmPipeVecEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed + 1)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = DQCritic(args.layer_num, args.state_shape, args.action_shape,
                      args.device).to(args.device)
    critic_target = DQCritic(args.layer_num, args.state_shape,
                             args.action_shape, args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor,
                       actor_optim,
                       critic,
                       critic_optim,
                       critic_target,
                       env.action_space,
                       args.device,
                       args.tau,
                       args.gamma,
                       args.alpha,
                       reward_normalization=args.rew_norm,
                       ignore_done=False)

    if args.mode == 'test':
        policy.load_state_dict(
            torch.load("{}/{}/{}/policy.pth".format(args.logdir, args.task,
                                                    args.comment),
                       map_location=args.device))
        env = gym.make(args.task)
        collector = Collector(policy, env
                              # Monitor(env, 'video', force=True)
                              )
        result = collector.collect(n_episode=10, render=args.render)
        print(
            f'Final reward: {result["ep/reward"]}, length: {result["ep/len"]}')
        collector.close()
        exit()
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    train_collector.collect(10000, sampling=True)
    test_collector = Collector(policy, test_envs)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold + 5

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_episode,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])

    pprint.pprint(result)
Example #11
    def __init__(self, env: gym.Env, is_eval: bool = False):
        env = AtariPreprocessing(env, terminal_on_life_loss=not is_eval)
        if not is_eval:
            env = TransformReward(env, lambda r: np.clip(r, -1.0, 1.0))
        super().__init__(ChannelFirst(env))
Example #12
def basic_wrapper(env):
    """Use this as a wrapper only for cartpole etc."""
    env = ReturnWrapper(env)
    env = TransformReward(env, lambda r: np.clip(r, -1, 1))
    return env
Example #13
import gym
import numpy as np

from gym.wrappers import AtariPreprocessing, TransformReward
from d3rlpy.algos import DoubleDQN
from d3rlpy.models.optimizers import AdamFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.envs import ChannelFirst

# get wrapped atari environment
env = ChannelFirst(
    TransformReward(
        AtariPreprocessing(gym.make('BreakoutNoFrameskip-v4'),
                           terminal_on_life_loss=True),
        lambda r: np.clip(r, -1.0, 1.0)))

eval_env = ChannelFirst(AtariPreprocessing(gym.make('BreakoutNoFrameskip-v4')))

# setup algorithm
dqn = DoubleDQN(batch_size=32,
                learning_rate=2.5e-4,
                optim_factory=AdamFactory(eps=1e-2 / 32),
                target_update_interval=10000,
                q_func_factory='mean',
                scaler='pixel',
                n_frames=4,
                use_gpu=True)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=1000000, env=env)
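
The example imports LinearDecayEpsilonGreedy but stops after building the buffer; a minimal sketch of how online training would typically continue with d3rlpy (the epsilon schedule and step counts are illustrative assumptions, not values taken from the example):

# Sketch only: epsilon-greedy exploration schedule plus the online training loop.
# The duration and step counts below are illustrative assumptions.
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=1000000)

dqn.fit_online(env,
               buffer,
               explorer=explorer,
               eval_env=eval_env,
               n_steps=1000000,
               n_steps_per_epoch=100000)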
Example #14
def test_sac_with_il(args=get_args()):
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: TransformReward(BipedalWrapper(gym.make(args.task)),
                                lambda reward: 5 * reward)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor,
                       actor_optim,
                       critic1,
                       critic1_optim,
                       critic2,
                       critic2_optim,
                       env.action_space,
                       args.tau,
                       args.gamma,
                       args.alpha,
                       reward_normalization=args.rew_norm,
                       ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    train_collector.collect(10000, sampling=True)
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    # test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(
            f'Final reward: {result["ep/reward"]}, length: {result["ep/len"]}')
        collector.close()

    # here we define an imitation collector with a trivial policy
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -300  # lower the goal
    net = Actor(1, args.state_shape, args.action_shape, args.max_action,
                args.device).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
    il_policy = ImitationPolicy(net, optim, mode='continuous')
    il_test_collector = Collector(il_policy, test_envs)
    train_collector.reset()
    result = offpolicy_trainer(il_policy,
                               train_collector,
                               il_test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    il_test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(il_policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(
            f'Final reward: {result["ep/reward"]}, length: {result["ep/len"]}')
        collector.close()