def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True, steps=100000, require_success=True): nproc = 8 def make_env(process_idx, test): size = 2 return ABC(size=size, discrete=discrete, episodic=episodic or test, partially_observable=self.use_lstm, deterministic=test) sample_env = make_env(0, False) action_space = sample_env.action_space obs_space = sample_env.observation_space def phi(x): return x n_hidden_channels = 20 n_hidden_layers = 1 nonlinearity = F.leaky_relu replay_buffer = EpisodicReplayBuffer(10**4) if use_lstm: if discrete: model = acer.ACERSharedModel( shared=L.LSTM(obs_space.low.size, n_hidden_channels), pi=policies.FCSoftmaxPolicy( n_hidden_channels, action_space.n, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, nonlinearity=nonlinearity, min_prob=1e-1), q=q_function.FCStateQFunctionWithDiscreteAction( n_hidden_channels, action_space.n, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, nonlinearity=nonlinearity), ) else: model = acer.ACERSDNSharedModel( shared=L.LSTM(obs_space.low.size, n_hidden_channels), pi=policies.FCGaussianPolicy( n_hidden_channels, action_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, bound_mean=True, min_action=action_space.low, max_action=action_space.high, nonlinearity=nonlinearity, min_var=1e-1), v=v_function.FCVFunction( n_hidden_channels, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, nonlinearity=nonlinearity), adv=q_function.FCSAQFunction( n_hidden_channels, action_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, nonlinearity=nonlinearity), ) else: if discrete: model = acer.ACERSeparateModel( pi=policies.FCSoftmaxPolicy( obs_space.low.size, action_space.n, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, nonlinearity=nonlinearity, min_prob=1e-1), q=q_function.FCStateQFunctionWithDiscreteAction( obs_space.low.size, action_space.n, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, nonlinearity=nonlinearity), ) else: model = acer.ACERSDNSeparateModel( pi=policies.FCGaussianPolicy( obs_space.low.size, action_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, bound_mean=True, min_action=action_space.low, max_action=action_space.high, nonlinearity=nonlinearity, min_var=1e-1), v=v_function.FCVFunction( obs_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, nonlinearity=nonlinearity), adv=q_function.FCSAQFunction( obs_space.low.size, action_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=n_hidden_layers, nonlinearity=nonlinearity), ) eps = 1e-8 opt = rmsprop_async.RMSpropAsync(lr=1e-3, eps=eps, alpha=0.99) opt.setup(model) gamma = 0.5 beta = 1e-5 if self.n_times_replay == 0 and self.disable_online_update: # At least one of them must be enabled return agent = acer.ACER(model, opt, replay_buffer=replay_buffer, t_max=t_max, gamma=gamma, beta=beta, phi=phi, n_times_replay=self.n_times_replay, act_deterministically=True, disable_online_update=self.disable_online_update, replay_start_size=100, use_trust_region=self.use_trust_region) max_episode_len = None if episodic else 2 with warnings.catch_warnings(record=True) as warns: train_agent_async(outdir=self.outdir, processes=nproc, make_env=make_env, agent=agent, steps=steps, max_episode_len=max_episode_len, eval_interval=500, eval_n_steps=None, eval_n_episodes=5, successful_score=1) assert len(warns) == 0, warns[0] # The agent returned by train_agent_async is not guaranteed to be # successful because parameters could be modified by other processes # after success. Thus here the successful model is loaded explicitly. if require_success: agent.load(os.path.join(self.outdir, 'successful')) agent.stop_episode() # Test env = make_env(0, True) n_test_runs = 5 for _ in range(n_test_runs): total_r = 0 obs = env.reset() done = False reward = 0.0 while not done: action = agent.act(obs) print('state:', obs, 'action:', action) obs, reward, done, _ = env.step(action) total_r += reward if require_success: self.assertAlmostEqual(total_r, 1) agent.stop_episode()
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='CartPole-v0') parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--t-max', type=int, default=50) parser.add_argument('--n-times-replay', type=int, default=4) parser.add_argument('--n-hidden-channels', type=int, default=100) parser.add_argument('--n-hidden-layers', type=int, default=2) parser.add_argument('--replay-capacity', type=int, default=5000) parser.add_argument('--replay-start-size', type=int, default=10**3) parser.add_argument('--disable-online-update', action='store_true') parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2) parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--truncation-threshold', type=float, default=5) parser.add_argument('--trust-region-delta', type=float, default=0.1) args = parser.parse_args() logging.basicConfig(level=args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) if args.monitor and process_idx == 0: env = gym.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if args.render and process_idx == 0 and not test: misc.env_modifiers.make_rendered(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space if isinstance(action_space, spaces.Box): model = acer.ACERSDNSeparateModel( pi=policies.FCGaussianPolicy( obs_space.low.size, action_space.low.size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, bound_mean=True, min_action=action_space.low, max_action=action_space.high), v=v_functions.FCVFunction(obs_space.low.size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers), adv=q_functions.FCSAQFunction( obs_space.low.size, action_space.low.size, n_hidden_channels=args.n_hidden_channels // 4, n_hidden_layers=args.n_hidden_layers), ) else: model = acer.ACERSeparateModel( pi=links.Sequence( L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu, L.Linear(args.n_hidden_channels, action_space.n, initialW=LeCunNormal(1e-3)), SoftmaxDistribution), q=links.Sequence( L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu, L.Linear(args.n_hidden_channels, action_space.n, initialW=LeCunNormal(1e-3)), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) replay_buffer = EpisodicReplayBuffer(args.replay_capacity) agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, disable_online_update=args.disable_online_update, use_trust_region=True, trust_region_delta=args.trust_region_delta, truncation_threshold=args.truncation_threshold, beta=args.beta, phi=phi) if args.load: agent.load(args.load) if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit)
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='CartPole-v0') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--t-max', type=int, default=50) parser.add_argument('--n-times-replay', type=int, default=4) parser.add_argument('--n-hidden-channels', type=int, default=100) parser.add_argument('--n-hidden-layers', type=int, default=2) parser.add_argument('--replay-capacity', type=int, default=5000) parser.add_argument('--replay-start-size', type=int, default=10**3) parser.add_argument('--disable-online-update', action='store_true') parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2) parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--truncation-threshold', type=float, default=5) parser.add_argument('--trust-region-delta', type=float, default=0.1) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor and process_idx == 0: env = chainerrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render and process_idx == 0 and not test: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space if isinstance(action_space, spaces.Box): model = acer.ACERSDNSeparateModel( pi=policies.FCGaussianPolicy( obs_space.low.size, action_space.low.size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, bound_mean=True, min_action=action_space.low, max_action=action_space.high), v=v_functions.FCVFunction(obs_space.low.size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers), adv=q_functions.FCSAQFunction( obs_space.low.size, action_space.low.size, n_hidden_channels=args.n_hidden_channels // 4, n_hidden_layers=args.n_hidden_layers), ) else: model = acer.ACERSeparateModel( pi=links.Sequence( L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu, L.Linear(args.n_hidden_channels, action_space.n, initialW=LeCunNormal(1e-3)), SoftmaxDistribution), q=links.Sequence( L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu, L.Linear(args.n_hidden_channels, action_space.n, initialW=LeCunNormal(1e-3)), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) replay_buffer = EpisodicReplayBuffer(args.replay_capacity) agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, disable_online_update=args.disable_online_update, use_trust_region=True, trust_region_delta=args.trust_region_delta, truncation_threshold=args.truncation_threshold, beta=args.beta) if args.load: agent.load(args.load) if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit)
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if (type(args) is list): args = make_args(args) if not os.path.exists(args.outdir): os.makedirs(args.outdir) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**32 def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor and process_idx == 0: env = chainerrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render and process_idx == 0 and not test: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space if isinstance(action_space, spaces.Box): model = acer.ACERSDNSeparateModel( pi=policies.FCGaussianPolicy( obs_space.low.size, action_space.low.size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, bound_mean=True, min_action=action_space.low, max_action=action_space.high), v=v_functions.FCVFunction(obs_space.low.size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers), adv=q_functions.FCSAQFunction( obs_space.low.size, action_space.low.size, n_hidden_channels=args.n_hidden_channels // 4, n_hidden_layers=args.n_hidden_layers), ) else: model = acer.ACERSeparateModel( pi=links.Sequence( L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu, L.Linear(args.n_hidden_channels, action_space.n, initialW=LeCunNormal(1e-3)), SoftmaxDistribution), q=links.Sequence( L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu, L.Linear(args.n_hidden_channels, action_space.n, initialW=LeCunNormal(1e-3)), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) replay_buffer = EpisodicReplayBuffer(args.replay_capacity) agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, disable_online_update=args.disable_online_update, use_trust_region=True, trust_region_delta=args.trust_region_delta, truncation_threshold=args.truncation_threshold, beta=args.beta) if args.load_agent: agent.load(args.load_agent) if (args.mode == 'train'): experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, step_offset=args.step_offset, checkpoint_freq=args.checkpoint_freq, log_type=args.log_type, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit) elif (args.mode == 'check'): from matplotlib import animation import matplotlib.pyplot as plt env = make_env(0, True) frames = [] for i in range(3): obs = env.reset() done = False R = 0 t = 0 while not done and t < 200: frames.append(env.render(mode='rgb_array')) action = agent.act(obs) obs, r, done, _ = env.step(action) R += r t += 1 print('test episode:', i, 'R:', R) agent.stop_episode() env.close() from IPython.display import HTML plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72) patch = plt.imshow(frames[0]) plt.axis('off') def animate(i): patch.set_data(frames[i]) anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50) anim.save(args.save_mp4) return anim