def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor: scale uint8 pixels to [0, 1] float32
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta, phi=phi)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Use the fixed base seed for the check/growth env
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=True,
            clip_rewards=True)
        env.seed(int(env_seed))
        return env

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
    elif args.mode == 'check':
        return tools.make_video.check(env=make_env_check(), agent=agent,
                                      save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(env=make_env_check(), agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
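# Illustrative sketch, not part of the original scripts: the per-process seed
# scheme described in the comments above, shown standalone. Training seeds are
# seed * processes + [0, ..., processes - 1]; evaluation envs mirror each
# training seed into the top of the 31-bit range so the two never collide.
# `subprocess_seeds` is a hypothetical helper name used only here.
import numpy as np

def subprocess_seeds(seed, processes):
    train_seeds = np.arange(processes) + seed * processes
    assert train_seeds.max() < 2**31
    eval_seeds = 2**31 - 1 - train_seeds
    return train_seeds, eval_seeds

train_seeds, eval_seeds = subprocess_seeds(seed=1, processes=4)
print(train_seeds)  # [4 5 6 7]
print(eval_seeds)   # [2147483643 2147483642 2147483641 2147483640]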
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process (i.e. processes > 1), the results
    # will no longer be deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)], os.path.join(args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor: scale uint8 pixels to [0, 1] float32
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Use the fixed base seed for the check/growth env
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=True,
            clip_rewards=True)
        env.seed(int(env_seed))
        return env

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_frequency,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
            log_type=args.log_type)
    elif args.mode == 'check':
        return tools.make_video.check(env=make_env_check(), agent=agent,
                                      save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(env=make_env_check(), agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
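# Illustrative sketch, not part of the original script: the learning-rate
# schedule that the LinearInterpolationHook in the 'train' branch applies,
# written as a plain function. `linear_lr` is a hypothetical name; the real
# hook passes the interpolated value to lr_setter(env, agent, value) at each
# global step.
import numpy as np

def linear_lr(global_step, total_steps, start_lr):
    # Linearly decay from start_lr at step 0 to 0 at total_steps and beyond.
    return float(np.interp(global_step, [0, total_steps], [start_lr, 0.0]))

print(linear_lr(0, 8 * 10**7, 7e-4))           # 0.0007
print(linear_lr(4 * 10**7, 8 * 10**7, 7e-4))   # about 0.00035
print(linear_lr(8 * 10**7, 8 * 10**7, 7e-4))   # 0.0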
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    if isinstance(action_space, spaces.Box):
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size, action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size, action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels),
                F.relu,
                L.Linear(args.n_hidden_channels, action_space.n,
                         initialW=LeCunNormal(1e-3)),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels),
                F.relu,
                L.Linear(args.n_hidden_channels, action_space.n,
                         initialW=LeCunNormal(1e-3)),
                DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)

    agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta)

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            log_type=args.log_type,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt

        env = make_env(0, True)

        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0), dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames),
                                       interval=50)
        anim.save(args.save_mp4)
        return anim
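# Illustrative sketch, not part of the original script: the idea behind the
# chainerrl.wrappers.ScaleReward wrapper used in make_env above is to multiply
# every reward by a constant factor so that returns stay in a range that is
# easy for the value function to fit. `ScaledReward` below is a hypothetical
# stand-in written with gym's RewardWrapper, not ChainerRL's implementation.
import gym

class ScaledReward(gym.RewardWrapper):
    def __init__(self, env, scale):
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        # Applied to every reward returned by step()
        return reward * self.scale

# Example: scale Pendulum-v0 rewards by 1e-2 during training only.
# env = ScaledReward(gym.make('Pendulum-v0'), scale=1e-2)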
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(),
                            L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    def phi(x):
        # Feature extractor: scale uint8 pixels to [0, 1] float32
        return np.asarray(x, dtype=np.float32) / 255

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99,
                       i_target=40000,
                       explorer=explorer, phi=phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
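# Illustrative sketch, not part of the original script: the per-process
# exploration schedule built in make_agent above. Each worker draws a final
# epsilon from {0.1 with p=0.4, 0.01 with p=0.3, 0.5 with p=0.3} and then
# anneals epsilon roughly linearly from 1 down to that value over
# final_exploration_frames steps. `sample_final_epsilon` and `epsilon_at`
# are hypothetical helper names used only here.
import random

def sample_final_epsilon():
    rand = random.random()
    if rand < 0.4:
        return 0.1
    elif rand < 0.7:
        return 0.01
    else:
        return 0.5

def epsilon_at(step, final_epsilon, final_exploration_frames):
    if step >= final_exploration_frames:
        return final_epsilon
    # Linear interpolation between 1 (at step 0) and final_epsilon
    return 1 + (final_epsilon - 1) * step / final_exploration_frames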
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use async training (--train-async), the results will no longer
    # be deterministic even with the same random seed.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if args.train_async:
        # Set different random seeds for different subprocesses.
        # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
        # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
        process_seeds = np.arange(args.processes) + args.seed * args.processes
        assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        if args.train_async:
            process_seed = int(process_seeds[process_idx])
            env_seed = 2**32 - 1 - process_seed if test else process_seed
        else:
            env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if isinstance(action_space, gym.spaces.Box):
        model = chainerrl.agents.pcl.PCLSeparateModel(
            pi=chainerrl.policies.FCGaussianPolicy(
                obs_space.low.size, action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high,
                var_wscale=1e-3,
                var_bias=1,
                var_type='diagonal',
            ),
            v=chainerrl.v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
            ))
    else:
        model = chainerrl.agents.pcl.PCLSeparateModel(
            pi=chainerrl.policies.FCSoftmaxPolicy(
                obs_space.low.size, action_space.n,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers),
            v=chainerrl.v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
            ),
        )

    if not args.train_async and args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu(args.gpu)

    if args.train_async:
        opt = rmsprop_async.RMSpropAsync(lr=args.lr, alpha=0.99)
    else:
        opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)

    if args.prioritized_replay:
        replay_buffer = \
            chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(
                capacity=5 * 10**3,
                uniform_ratio=0.1,
                default_priority_func=exp_return_of_episode,
                wait_priority_after_sampling=False,
                return_sample_weights=False)
    else:
        replay_buffer = chainerrl.replay_buffer.EpisodicReplayBuffer(
            capacity=5 * 10**3)

    agent = chainerrl.agents.PCL(
        model, opt,
        replay_buffer=replay_buffer,
        t_max=args.t_max,
        gamma=0.99,
        tau=args.tau,
        rollout_len=args.rollout_len,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        batchsize=args.batchsize,
        train_async=args.train_async,
        disable_online_update=args.disable_online_update,
        backprop_future_values=args.backprop_future_values,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        if args.train_async:
            experiments.train_agent_async(
                agent=agent,
                outdir=args.outdir,
                processes=args.processes,
                make_env=make_env,
                profile=args.profile,
                steps=args.steps,
                step_offset=args.step_offset,
                eval_n_steps=None,
                eval_n_episodes=args.eval_n_runs,
                eval_interval=args.eval_interval,
                max_episode_len=timestep_limit)
            if args.save_agent:
                agent.save(args.save_agent)
        else:
            experiments.train_agent_with_evaluation(
                agent=agent,
                env=make_env(0, test=False),
                eval_env=make_env(0, test=True),
                outdir=args.outdir,
                steps=args.steps,
                step_offset=args.step_offset,
                checkpoint_freq=args.checkpoint_freq,
                eval_n_steps=None,
                eval_n_episodes=args.eval_n_runs,
                eval_interval=args.eval_interval,
                train_max_episode_len=timestep_limit)
    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt

        # The original code referenced `env` here without creating it first;
        # build an evaluation env before rolling out episodes.
        env = make_env(0, test=True)

        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0), dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames),
                                       interval=50)
        anim.save(args.outdir + '/test.mp4')
        return anim
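# Illustrative sketch, not part of the original scripts: every main() above
# accepts either an already-parsed argument namespace or a raw argv-style
# list, which it first passes through make_args(). The flag names below are
# assumptions for illustration only; the real ones are defined by make_args()
# elsewhere in the repository.
main(['--env', 'Pendulum-v0',
      '--mode', 'train',
      '--processes', '4',
      '--outdir', 'results/pcl_pendulum'])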