def create_agent(self, env):
    model = agents.ddpg.DDPGModel(
        policy=create_stochastic_policy_for_env(env),
        q_func=create_state_action_q_function_for_env(env))
    rbuf = replay_buffer.ReplayBuffer(10 ** 5)
    opt_a = optimizers.Adam()
    opt_a.setup(model.policy)
    opt_b = optimizers.Adam()
    opt_b.setup(model.q_function)
    explorer = explorers.AdditiveGaussian(scale=1)
    return agents.PGT(model, opt_a, opt_b, rbuf, gamma=0.99,
                      explorer=explorer)
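# The factory helpers called above are not defined in this excerpt. Below is a
# minimal sketch of what they might look like, assuming ChainerRL's built-in
# FCGaussianPolicy and FCSAQFunction and the module-level imports used
# elsewhere in this section (e.g. `import chainerrl`). The hidden-layer sizes
# are illustrative choices, not taken from the original code.
def create_stochastic_policy_for_env(env):
    obs_size = env.observation_space.low.size
    action_size = env.action_space.low.size
    # Gaussian policy whose mean is bounded to the action range.
    return chainerrl.policies.FCGaussianPolicy(
        obs_size, action_size,
        n_hidden_channels=200, n_hidden_layers=2,
        bound_mean=True,
        min_action=env.action_space.low,
        max_action=env.action_space.high)


def create_state_action_q_function_for_env(env):
    obs_size = env.observation_space.low.size
    action_size = env.action_space.low.size
    # Fully connected Q(s, a) critic.
    return chainerrl.q_functions.FCSAQFunction(
        obs_size, action_size,
        n_hidden_channels=200, n_hidden_layers=2)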
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Hopper-v2',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load', type=str, default='',
                        help='Directory to load agent from.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs', type=int, default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval', type=int, default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size', type=int, default=10000,
                        help='Minimum replay buffer size before '
                             'performing gradient updates.')
    parser.add_argument('--batch-size', type=int, default=100,
                        help='Minibatch size')
    parser.add_argument('--render', action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo', action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor', action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    action_space = env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    action_size = action_space.low.size

    winit = chainer.initializers.LeCunUniform(3 ** -0.5)

    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    policy_optimizer = optimizers.Adam().setup(policy)

    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 400, initialW=winit),
            F.relu,
            L.Linear(None, 300, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam().setup(q_func)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
        name='observation')
    fake_action = chainer.Variable(
        policy.xp.zeros_like(action_space.low, dtype=np.float32)[None],
        name='action')
    chainerrl.misc.draw_computational_graph(
        [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph(
        [q_func1(fake_obs, fake_action)],
        os.path.join(args.outdir, 'q_func1'))
    chainerrl.misc.draw_computational_graph(
        [q_func2(fake_obs, fake_action)],
        os.path.join(args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(
            action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = chainerrl.agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )

    if len(args.load) > 0:
        agent.load(args.load)

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
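# Note: the TD3 script above calls a module-level `concat_obs_and_action`
# helper that is not shown in this excerpt. Judging from the test code later
# in this section, it is presumably defined along these lines:
def concat_obs_and_action(obs, action):
    """Concatenate observation and action along the last axis for the critic."""
    return F.concat((obs, action), axis=-1)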
def _test_load_ddpg(self, gpu):
    def concat_obs_and_action(obs, action):
        return F.concat((obs, action), axis=-1)

    action_size = 3
    winit = chainer.initializers.LeCunUniform(3 ** -0.5)
    q_func = chainer.Sequential(
        concat_obs_and_action,
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, 1, initialW=winit),
    )
    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    from chainerrl.agents.ddpg import DDPGModel
    model = DDPGModel(q_func=q_func, policy=policy)

    obs_low = [-np.inf] * 11
    fake_obs = chainer.Variable(
        model.xp.zeros_like(obs_low, dtype=np.float32)[None],
        name='observation')
    fake_action = chainer.Variable(
        model.xp.zeros_like([-1., -1., -1.], dtype=np.float32)[None],
        name='action')
    policy(fake_obs)
    q_func(fake_obs, fake_action)

    opt_a = optimizers.Adam()
    opt_c = optimizers.Adam()
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=[-1., -1., -1.], high=[1., 1., 1.])

    agent = agents.DDPG(
        model,
        opt_a,
        opt_c,
        replay_buffer.ReplayBuffer(100),
        gamma=0.99,
        explorer=explorer,
        replay_start_size=1000,
        target_update_method='soft',
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=gpu,
        minibatch_size=100,
        burnin_action_func=None)

    model, exists = download_model(
        "DDPG", "Hopper-v2", model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def _test_load_td3(self, gpu):
    def concat_obs_and_action(obs, action):
        """Concat observation and action to feed the critic."""
        return F.concat((obs, action), axis=-1)

    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 400, initialW=winit),
            F.relu,
            L.Linear(None, 300, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam().setup(q_func)
        return q_func, q_func_optimizer

    winit = chainer.initializers.LeCunUniform(3 ** -0.5)
    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    action_size = 3
    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    policy_optimizer = optimizers.Adam().setup(policy)

    rbuf = replay_buffer.ReplayBuffer(100)
    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=[-1., -1., -1.], high=[1., 1., 1.])

    agent = agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=10000,
        gpu=gpu,
        minibatch_size=100,
        burnin_action_func=None)

    model, exists = download_model(
        "TD3", "Hopper-v2", model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Inv',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' It will be created if it does not exist.')
    parser.add_argument('--seed', type=int, default=420,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=-1,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load', type=str, default='',
                        help='Directory to load agent from.')
    parser.add_argument('--steps', type=int, default=10 ** 5,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs', type=int, default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval', type=int, default=100,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size', type=int, default=1000,
                        help='Minimum replay buffer size before '
                             'performing gradient updates.')
    parser.add_argument('--batch-size', type=int, default=4,
                        help='Minibatch size')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--render', action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo', action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor', action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def make_env(test):
        env = gym.make(
            "DaktyPushingSimulationEnv-v0",
            level=5,
            simulation_backend="mujoco",
            control_frequency_in_hertz=100,
            state_space_components_to_be_used=None,
            alternate_env_object=None,
            discretization_factor_torque_control_space=None,
            model_as_function_for_pixel_to_latent_space_parsing=(None, None))
        print('\n############\n', env, '\n############\n')
        env.unwrapped.finger.set_resolution_quality('low')
        print('\n############\n', env, '\n############\n')
        env = gym.wrappers.TimeLimit(env)
        print('\n############\n', env, '\n############\n')
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    action_space = env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    action_size = action_space.low.size

    winit = chainer.initializers.LeCunUniform(3 ** -0.5)

    # Define the policy (output_dim = action_size) and its optimiser.
    policy = chainer.Sequential(
        L.Linear(None, 128, initialW=winit),
        F.relu,
        L.Linear(None, 64, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    policy_optimizer = optimizers.Adam(3e-4).setup(policy)
    # policy.to_gpu(0)
    # Define the Q-function (output_dim = 1) and its optimiser; this builds
    # two identical Q-functions with their respective optimisers.
    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 128, initialW=winit),
            F.relu,
            L.Linear(None, 64, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam().setup(q_func)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()
    # q_func1.to_gpu(0)
    # q_func2.to_gpu(0)

    print('\n\n-------------------\n', obs_space.low.shape,
          '\n-------------------\n')

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
        name='observation')
    fake_action = chainer.Variable(
        policy.xp.zeros_like(action_space.low, dtype=np.float32)[None],
        name='action')
    chainerrl.misc.draw_computational_graph(
        [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph(
        [q_func1(fake_obs, fake_action)],
        os.path.join(args.outdir, 'q_func1'))
    chainerrl.misc.draw_computational_graph(
        [q_func2(fake_obs, fake_action)],
        os.path.join(args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10 ** 5)

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(
            action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = chainerrl.agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )
    # agent.to_gpu(0)

    if len(args.load) > 0:
        agent.load(args.load)

    sys.stdout.flush()

    print('\nbeginning training\n')
    n_episodes = 10000
    # pbar = tqdm(total=n_episodes)
    max_episode_len = 5000
    for i in range(1, n_episodes + 1):
        # pbar.update(1)
        obs = env.reset()
        # print('obs initial..............', obs.shape)
        reward = 0
        done = False
        R = 0  # return (sum of rewards)
        t = 0  # time step
        pbar = tqdm(total=max_episode_len)
        while not done and t < max_episode_len:
            pbar.update(1)
            # Uncomment to watch the behaviour
            # env.render()
            action = agent.act_and_train(obs, reward)
            # print('action..................', action)
            obs, reward, done, _ = env.step(action)
            # print('obs.....................', obs)
            # print('reward..................', reward)
            R += reward
            t += 1
        if i % 1 == 0:
            print('episode:', i, 'R:', R,
                  'statistics:', agent.get_statistics())
        agent.stop_episode_and_train(obs, reward, done)
    print('Finished.')
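    # The training loop above never evaluates or saves the agent. A minimal
    # follow-up sketch (not in the original script), using the standard
    # ChainerRL agent methods act(), stop_episode() and save(); the number of
    # evaluation episodes is an illustrative choice.
    n_eval_episodes = 10
    for i in range(n_eval_episodes):
        obs = env.reset()
        done = False
        R = 0
        t = 0
        while not done and t < max_episode_len:
            # act() selects a greedy action without exploration noise.
            action = agent.act(obs)
            obs, reward, done, _ = env.step(action)
            R += reward
            t += 1
        agent.stop_episode()
        print('evaluation episode:', i, 'R:', R)
    # Persist the trained agent to the output directory.
    agent.save(os.path.join(args.outdir, 'final_agent'))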