def make_model(self, env):
    n_dim_obs = env.observation_space.low.size
    n_dim_action = env.action_space.low.size
    n_hidden_channels = 50
    policy = FCBNDeterministicPolicy(
        n_input_channels=n_dim_obs,
        n_hidden_layers=2,
        n_hidden_channels=n_hidden_channels,
        action_size=n_dim_action,
        min_action=env.action_space.low,
        max_action=env.action_space.high,
        bound_action=True)
    q_func = FCBNLateActionSAQFunction(
        n_dim_obs=n_dim_obs,
        n_dim_action=n_dim_action,
        n_hidden_layers=2,
        n_hidden_channels=n_hidden_channels)
    return DDPGModel(policy=policy, q_func=q_func)
def build_agent() -> DDPG:
    # observation:
    #   friction on each somite (#somite)
    #   tension on each somite except for both ends (#somite - 2)
    #   cos(somite phases), sin(somite phases) (#oscillator x 2)
    #   cos(gripper phases), sin(gripper phases) (#gripper x 2)
    obs_size = config.somites + (config.somites - 2) \
        + config.oscillators * 2 + config.grippers * 2
    # actions: feedbacks to somite oscillators and to gripper oscillators
    action_size = config.oscillators + config.grippers

    q_func = q_functions.FCBNLateActionSAQFunction(
        obs_size, action_size,
        n_hidden_channels=6, n_hidden_layers=2,
        normalize_input=True)
    pi = policy.FCBNDeterministicPolicy(
        obs_size, action_size=action_size,
        n_hidden_channels=6, n_hidden_layers=2,
        min_action=-F_OUTPUT_BOUND, max_action=F_OUTPUT_BOUND,
        bound_action=True, normalize_input=True)
    model = DDPGModel(q_func=q_func, policy=pi)

    opt_actor = optimizers.Adam()
    opt_actor.setup(model['policy'])
    opt_critic = optimizers.Adam()
    opt_critic.setup(model['q_function'])
    opt_actor.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_actor')
    opt_critic.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_critic')

    rep_buf = replay_buffer.ReplayBuffer(capacity=1 * 10 ** 5)
    explorer = explorers.AdditiveOU(sigma=OU_SIGMA)
    # Cast observations to float32 for the model
    phi = lambda x: x.astype(np.float32)
    agent = DDPG(model, opt_actor, opt_critic, rep_buf,
                 gamma=GAMMA, explorer=explorer, phi=phi,
                 gpu=GPU, replay_start_size=10000)
    return agent
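# build_agent above relies on a `config` object and several module-level
# constants that are not shown in this snippet. A minimal, hypothetical sketch
# of those assumptions (names and values are placeholders, not the project's
# actual settings):
from types import SimpleNamespace

config = SimpleNamespace(somites=5, oscillators=5, grippers=2)  # hypothetical sizes
F_OUTPUT_BOUND = 1.0   # bound on the feedback outputs (assumed)
OU_SIGMA = 0.3         # Ornstein-Uhlenbeck noise scale (assumed)
GAMMA = 0.99           # discount factor (assumed)
GPU = -1               # -1 runs on CPU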
q_func = q_functions.FCSAQFunction(
    obs_size, action_size,
    n_hidden_channels=critic_hidden_units,
    n_hidden_layers=critic_hidden_layers)
pi = policy.FCDeterministicPolicy(
    obs_size, action_size=action_size,
    n_hidden_channels=actor_hidden_units,
    n_hidden_layers=actor_hidden_layers,
    min_action=action_space.low,
    max_action=action_space.high,
    bound_action=True)
# The Model
model = DDPGModel(q_func=q_func, policy=pi)

opt_actor = optimizers.Adam(alpha=actor_lr)
opt_critic = optimizers.Adam(alpha=critic_lr)
opt_actor.setup(model['policy'])
opt_critic.setup(model['q_function'])
opt_actor.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
opt_critic.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

rbuf = replay_buffer.ReplayBuffer(replay_buffer_size)
ou_sigma = (action_space.high - action_space.low) * 0.2
explorer = explorers.AdditiveOU(sigma=ou_sigma)

# The agent
agent = DDPG(model, opt_actor,
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Humanoid-v2')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10 ** 6)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=10 ** 7)
    parser.add_argument('--n-hidden-channels', type=int, default=300)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=5000)
    parser.add_argument('--n-update-times', type=int, default=1)
    parser.add_argument('--target-update-interval', type=int, default=1)
    parser.add_argument('--target-update-method', type=str, default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10 ** 5)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--minibatch-size', type=int, default=200)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--use-bn', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def reward_filter(r):
        return r * args.reward_scale_factor

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space
    action_size = np.asarray(action_space.shape).prod()

    if args.use_bn:
        q_func = q_functions.FCBNLateActionSAQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            normalize_input=True)
        pi = policy.FCBNDeterministicPolicy(
            obs_size, action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low, max_action=action_space.high,
            bound_action=True, normalize_input=True)
    else:
        q_func = q_functions.FCSAQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        pi = policy.FCDeterministicPolicy(
            obs_size, action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low, max_action=action_space.high,
            bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)

    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10 ** 5)

    def random_action():
        a = action_space.sample()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)

    agent = DDPG(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                 explorer=explorer, replay_start_size=args.replay_start_size,
                 target_update_method=args.target_update_method,
                 target_update_interval=args.target_update_interval,
                 update_interval=args.update_interval,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 gpu=args.gpu, minibatch_size=args.minibatch_size)

    if len(args.load) > 0:
        agent.load(args.load)

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_env=eval_env, eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
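# The example scripts in this section assume roughly the following imports and a
# standard entry point. Exact module paths can differ across ChainerRL versions;
# this is a hedged sketch rather than part of the original sources:
import argparse
import sys

import chainer
from chainer import optimizers
import gym
from gym import spaces
import numpy as np

import chainerrl
from chainerrl import experiments, explorers, misc, policy, q_functions, replay_buffer
from chainerrl.agents.ddpg import DDPG, DDPGModel

if __name__ == '__main__':
    main()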
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='out')
    parser.add_argument('--env', type=str, default='Humanoid-v1')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10 ** 6)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=10 ** 7)
    parser.add_argument('--n-hidden-channels', type=int, default=300)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=5000)
    parser.add_argument('--n-update-times', type=int, default=1)
    # Note: this older example uses the *-frequency option names that later
    # ChainerRL versions renamed to *-interval.
    parser.add_argument('--target-update-frequency', type=int, default=1)
    parser.add_argument('--target-update-method', type=str, default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-frequency', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-frequency', type=int, default=10 ** 5)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--minibatch-size', type=int, default=200)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--use-bn', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def reward_filter(r):
        return r * args.reward_scale_factor

    def make_env():
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        misc.env_modifiers.make_reward_filtered(env, reward_filter)
        if args.render:
            misc.env_modifiers.make_rendered(env)

        # Give the env a no-op __exit__ so it can be used as a context manager
        def __exit__(self, *args):
            pass
        env.__exit__ = __exit__
        return env

    env = make_env()
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space
    action_size = np.asarray(action_space.shape).prod()

    if args.use_bn:
        q_func = q_functions.FCBNLateActionSAQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            normalize_input=True)
        pi = policy.FCBNDeterministicPolicy(
            obs_size, action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low, max_action=action_space.high,
            bound_action=True, normalize_input=True)
    else:
        q_func = q_functions.FCSAQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        pi = policy.FCDeterministicPolicy(
            obs_size, action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low, max_action=action_space.high,
            bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)

    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10 ** 5)

    def phi(obs):
        return obs.astype(np.float32)

    def random_action():
        a = action_space.sample()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)

    agent = DDPG(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                 explorer=explorer, replay_start_size=args.replay_start_size,
                 target_update_method=args.target_update_method,
                 target_update_frequency=args.target_update_frequency,
                 update_frequency=args.update_frequency,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 phi=phi, gpu=args.gpu, minibatch_size=args.minibatch_size)
    agent.logger.setLevel(logging.DEBUG)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        mean, median, stdev = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_frequency=args.eval_frequency,
            outdir=args.outdir, max_episode_len=timestep_limit)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Hopper-v2',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load', type=str, default='',
                        help='Directory to load agent from.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs', type=int, default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval', type=int, default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size', type=int, default=10000,
                        help='Minimum replay buffer size before '
                             'performing gradient updates.')
    parser.add_argument('--batch-size', type=int, default=100,
                        help='Minibatch size')
    parser.add_argument('--render', action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo', action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor', action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    action_space = env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    action_size = action_space.low.size

    winit = chainer.initializers.LeCunUniform(3 ** -0.5)

    q_func = chainer.Sequential(
        concat_obs_and_action,
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, 1, initialW=winit),
    )
    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    model = DDPGModel(q_func=q_func, policy=policy)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        model.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
        name='observation')
    fake_action = chainer.Variable(
        model.xp.zeros_like(action_space.low, dtype=np.float32)[None],
        name='action')
    chainerrl.misc.draw_computational_graph(
        [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph(
        [q_func(fake_obs, fake_action)], os.path.join(args.outdir, 'q_func'))

    opt_a = optimizers.Adam()
    opt_c = optimizers.Adam()
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(
            action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = DDPG(
        model, opt_a, opt_c, rbuf,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_method='soft',
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )

    if len(args.load) > 0:
        agent.load(args.load)

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_env=eval_env, eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
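# The Hopper-v2 example above uses a `concat_obs_and_action` helper, plus the
# L, F, os, and logging names, without showing where they come from. The test
# snippet at the end of this section defines the same helper, so a minimal
# sketch of the missing pieces would be:
import logging
import os

import chainer.functions as F
import chainer.links as L


def concat_obs_and_action(obs, action):
    """Concatenate observation and action along the feature axis."""
    return F.concat((obs, action), axis=-1)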
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='FetchPickAndPlace-v1')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10 ** 6)
    parser.add_argument('--actor-lr', type=float, default=1e-3)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=200 * 50 * 16 * 50)
    parser.add_argument('--n-hidden-channels', type=int, default=64)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-update-times', type=int, default=40)
    parser.add_argument('--target-update-interval', type=int, default=16 * 50)
    parser.add_argument('--target-update-method', type=str, default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1 - 0.95)
    parser.add_argument('--update-interval', type=int, default=16 * 50)
    parser.add_argument('--eval-n-runs', type=int, default=30)
    parser.add_argument('--eval-interval', type=int, default=50 * 16 * 50)
    parser.add_argument('--gamma', type=float, default=0.98)
    parser.add_argument('--minibatch-size', type=int, default=128)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--epsilon', type=float, default=0.05)
    parser.add_argument('--noise-std', type=float, default=0.05)
    parser.add_argument('--clip-threshold', type=float, default=5.0)
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    def make_env(idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        if test:
            env = HEREnvWrapper(env, args.outdir)
        return env

    def make_batch_env(test):
        # Bind idx as a default argument so each subprocess gets its own index
        return chainerrl.envs.MultiprocessVectorEnv(
            [(lambda idx=idx: make_env(idx, test))
             for idx in range(args.num_envs)])

    sample_env = make_env(0, test=False)

    def reward_function(state, action, goal):
        return sample_env.compute_reward(
            achieved_goal=state['achieved_goal'], desired_goal=goal, info=None)

    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    space_dict = sample_env.observation_space.spaces
    observation_space = space_dict['observation']
    goal_space = space_dict['desired_goal']
    obs_size = np.asarray(observation_space.shape).prod()
    goal_size = np.asarray(goal_space.shape).prod()
    action_space = sample_env.action_space
    action_size = np.asarray(action_space.shape).prod()

    q_func = q_functions.FCSAQFunction(
        obs_size + goal_size, action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    pi = policy.FCDeterministicPolicy(
        obs_size + goal_size, action_size=action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers,
        min_action=action_space.low, max_action=action_space.high,
        bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)

    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.HindsightReplayBuffer(
        reward_function, 10 ** 6, future_k=4)

    def phi(dict_state):
        # Concatenate the observation with the desired goal
        return np.concatenate(
            (dict_state['observation'].astype(np.float32, copy=False),
             dict_state['desired_goal'].astype(np.float32, copy=False)), 0)

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_size + goal_size, clip_threshold=args.clip_threshold)

    explorer = HERExplorer(args.noise_std, args.epsilon, action_space)

    agent = DDPG(model, opt_a, opt_c, rbuf,
                 obs_normalizer=obs_normalizer,
                 gamma=args.gamma, explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 phi=phi,
                 target_update_method=args.target_update_method,
                 target_update_interval=args.target_update_interval,
                 update_interval=args.update_interval,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 gpu=args.gpu, minibatch_size=args.minibatch_size,
                 clip_critic_tgt=(-1.0 / (1.0 - args.gamma), 0.0))

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            steps=args.steps,
            eval_env=make_batch_env(test=True),
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            max_episode_len=timestep_limit)
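# HEREnvWrapper and HERExplorer in the HER example above are project-specific
# classes that are not part of ChainerRL and are not shown in the source. Purely
# as an illustration of the kind of explorer the example expects (random action
# with probability epsilon, otherwise Gaussian noise around the greedy action),
# a hypothetical sketch might look like this:
import numpy as np

from chainerrl import explorer


class HERExplorer(explorer.Explorer):
    """Hypothetical epsilon-random + additive-Gaussian explorer (assumed)."""

    def __init__(self, noise_std, epsilon, action_space):
        self.noise_std = noise_std
        self.epsilon = epsilon
        self.action_space = action_space

    def select_action(self, t, greedy_action_func, action_value=None):
        if np.random.rand() < self.epsilon:
            # Uniform random action with probability epsilon
            return self.action_space.sample().astype(np.float32)
        # Otherwise, perturb the greedy action with Gaussian noise and clip
        a = greedy_action_func()
        noise = np.random.normal(scale=self.noise_std, size=a.shape)
        return np.clip(a + noise, self.action_space.low,
                       self.action_space.high).astype(np.float32)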
def make_agent_ddpg(args, env):
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space
    action_size = np.asarray(action_space.shape).prod()

    q_func = FCSAQFunction(
        obs_size, action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    pi = FCDeterministicPolicy(
        obs_size, action_size=action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers,
        min_action=action_space.low, max_action=action_space.high,
        bound_action=True)
    if args.gpu > -1:
        q_func.to_gpu(args.gpu)
        pi.to_gpu(args.gpu)
    else:
        q_func.to_cpu()
        pi.to_cpu()
    model = DDPGModel(q_func=q_func, policy=pi)

    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10 ** 5)

    def phi(obs):
        return obs.astype(np.float32)

    # def random_action():
    #     a = action_space.sample()
    #     if isinstance(a, np.ndarray):
    #         a = a.astype(np.float32)
    #     return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)

    if args.skip_step == 0:
        agent = DDPG(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                     explorer=explorer,
                     replay_start_size=args.replay_start_size,
                     target_update_method=args.target_update_method,
                     target_update_interval=args.target_update_interval,
                     update_interval=args.update_interval,
                     soft_update_tau=args.soft_update_tau,
                     n_times_update=args.n_update_times,
                     phi=phi, gpu=args.gpu,
                     minibatch_size=args.minibatch_size)
    else:
        agent = DDPGStep(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                         explorer=explorer,
                         replay_start_size=args.replay_start_size,
                         target_update_method=args.target_update_method,
                         target_update_interval=args.target_update_interval,
                         update_interval=args.update_interval,
                         soft_update_tau=args.soft_update_tau,
                         n_times_update=args.n_update_times,
                         phi=phi, gpu=args.gpu,
                         minibatch_size=args.minibatch_size,
                         skip_step=args.skip_step)

    if args.model_dir is not None:
        agent.save(args.model_dir)
    return agent
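# A minimal, hypothetical way to call make_agent_ddpg, showing the argument
# fields it reads. The values below are illustrative placeholders, not the
# project's actual defaults:
import gym
from argparse import Namespace

env = gym.make('Pendulum-v0')
args = Namespace(
    gpu=-1, actor_lr=1e-4, critic_lr=1e-3,
    n_hidden_channels=300, n_hidden_layers=3,
    gamma=0.995, replay_start_size=5000,
    target_update_method='soft', target_update_interval=1,
    update_interval=4, soft_update_tau=1e-2,
    n_update_times=1, minibatch_size=200,
    skip_step=0, model_dir=None)
agent = make_agent_ddpg(args, env)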
def main():
    # Reinforcement-learning parameters
    gamma = 0.995
    num_episodes = 100  # total number of episodes

    # DDPG setup
    q_func = QFunction()        # Q-function
    policy = PolicyNetwork()    # policy network
    model = DDPGModel(q_func=q_func, policy=policy)
    optimizer_p = chainer.optimizers.Adam(alpha=1e-4)
    optimizer_q = chainer.optimizers.Adam(alpha=1e-3)
    optimizer_p.setup(model['policy'])
    optimizer_q.setup(model['q_function'])
    explorer = chainerrl.explorers.AdditiveOU(sigma=1.0)  # sigma sets the strength of the added noise
    replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
    phi = lambda x: x.astype(np.float32, copy=False)
    agent = DDPG(model, optimizer_p, optimizer_q, replay_buffer,
                 gamma=gamma, explorer=explorer, replay_start_size=1000,
                 target_update_method='soft', target_update_interval=1,
                 update_interval=4, soft_update_tau=0.01, n_times_update=1,
                 phi=phi, gpu=-1, minibatch_size=200)

    def reward_filter(r):
        # Scale the reward down (so it stays roughly within 0 to 1)
        return r * 0.01

    outdir = 'result'
    chainerrl.misc.set_random_seed(0)
    env = gym.make('SpaceInvaders-v0')  # create the Space Invaders environment
    env.seed(0)
    chainerrl.misc.env_modifiers.make_reward_filtered(env, reward_filter)
    env = gym.wrappers.Monitor(env, outdir)  # record videos

    # Run the episodes and train the agent
    for episode in range(1, num_episodes + 1):  # repeat for the number of episodes
        done = False
        reward = 0
        n_steps = 0
        total_reward = 0
        obs = env.reset()
        obs = np.asarray(obs.transpose(2, 0, 1), dtype=np.float32)
        while not done:
            action = agent.act_and_train(obs, reward)  # action is a continuous vector
            action = F.argmax(action).data  # pick the action with the largest output value
            obs, reward, done, info = env.step(action)  # execute the action
            total_reward += reward
            n_steps += 1
            obs = np.asarray(obs.transpose(2, 0, 1), dtype=np.float32)
            print('{0:4d}: action {1}, reward {2}, done? {3}, {4}'.format(
                n_steps, action, reward, done, info))
        agent.stop_episode_and_train(obs, reward, done)
        print('Episode {0:4d}: total reward {1}, n_steps {2}, statistics: {3}'.format(
            episode, total_reward, n_steps, agent.get_statistics()))
        if episode % 10 == 0:
            agent.save('agent_DDPG_spaceinvaders_' + str(episode))
def _test_load_ddpg(self, gpu):

    def concat_obs_and_action(obs, action):
        return F.concat((obs, action), axis=-1)

    action_size = 3
    winit = chainer.initializers.LeCunUniform(3 ** -0.5)

    q_func = chainer.Sequential(
        concat_obs_and_action,
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, 1, initialW=winit),
    )
    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )

    from chainerrl.agents.ddpg import DDPGModel
    model = DDPGModel(q_func=q_func, policy=policy)

    obs_low = [-np.inf] * 11
    fake_obs = chainer.Variable(
        model.xp.zeros_like(obs_low, dtype=np.float32)[None],
        name='observation')
    fake_action = chainer.Variable(
        model.xp.zeros_like([-1., -1., -1.], dtype=np.float32)[None],
        name='action')
    policy(fake_obs)
    q_func(fake_obs, fake_action)

    opt_a = optimizers.Adam()
    opt_c = optimizers.Adam()
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=[-1., -1., -1.], high=[1., 1., 1.])

    agent = agents.DDPG(model, opt_a, opt_c,
                        replay_buffer.ReplayBuffer(100),
                        gamma=0.99,
                        explorer=explorer,
                        replay_start_size=1000,
                        target_update_method='soft',
                        target_update_interval=1,
                        update_interval=1,
                        soft_update_tau=5e-3,
                        n_times_update=1,
                        gpu=gpu,
                        minibatch_size=100,
                        burnin_action_func=None)

    model, exists = download_model(
        "DDPG", "Hopper-v2", model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists