def set_explorer(env):
    # Possible parameters:
    # Initial (and max) value of epsilon at the start of the experimentation.
    start_epsilon = 1.0
    # Minimum value of epsilon
    end_epsilon = 0.1
    # Constant epsilon
    cons_epsilon = 0.001
    # How many steps it takes for epsilon to decay
    final_exploration_steps = 10**5

    # Options for exploration (more explorers at site-packages/chainerrl/explorers/)
    # Option 1: Constant
    constant_epsilon_explorer = explorers.ConstantEpsilonGreedy(
        epsilon=cons_epsilon,
        random_action_func=env.action_space.sample)
    # Option 2: Linear decay
    decay_epsilon_explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon,
        end_epsilon,
        final_exploration_steps,
        random_action_func=env.action_space.sample)
    # Return whichever explorer you want the agent to use
    return constant_epsilon_explorer
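A minimal usage sketch (an assumption, not from the source): `set_explorer` only needs an object whose `action_space` has a `sample` method, so any Gym env works. The `CartPole-v0` env and the stubbed greedy policy below are placeholders for illustration.

import gym
from chainerrl import explorers

env = gym.make('CartPole-v0')
explorer = set_explorer(env)
# Explorers choose between the greedy action and a random one;
# here the greedy policy is stubbed out with a constant action.
action = explorer.select_action(t=0, greedy_action_func=lambda: 0)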
def create_agent(self, env):
    model = create_state_q_function_for_env(env)
    rbuf = replay_buffer.ReplayBuffer(10**5)
    opt = optimizers.Adam()
    opt.setup(model)
    explorer = explorers.ConstantEpsilonGreedy(
        0.2, random_action_func=lambda: env.action_space.sample())
    return agents.DQN(model, opt, rbuf, gamma=0.99, explorer=explorer)
def make_chainer_dqn(obs_size, action_space):
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size, action_space.n, 50, 1)
    explorer = explorers.ConstantEpsilonGreedy(0.1, action_space.sample)
    opt = optimizers.Adam(eps=1e-2)
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(10**5)
    agent = DQN(q_func, opt, rbuf, explorer=explorer, gamma=0.9)
    return agent
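To show how such an agent is driven, here is a hedged training-loop sketch using ChainerRL's `act_and_train`/`stop_episode_and_train` interface; the `CartPole-v0` env and episode count are assumptions.

import gym
import numpy as np

env = gym.make('CartPole-v0')  # placeholder env, an assumption
agent = make_chainer_dqn(env.observation_space.low.size, env.action_space)
for episode in range(100):
    obs = env.reset().astype(np.float32)  # Chainer expects float32 inputs
    reward, done = 0, False
    while not done:
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        obs = obs.astype(np.float32)
    agent.stop_episode_and_train(obs, reward, done)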
def create_agent(self, env):
    model = create_state_q_function_for_env(env)
    opt = optimizers.Adam()
    opt.setup(model)
    explorer = explorers.ConstantEpsilonGreedy(
        0.2, random_action_func=lambda: env.action_space.sample())
    return agents.NSQ(q_function=model, optimizer=opt, t_max=1, gamma=0.99,
                      i_target=100, explorer=explorer)
n_actions = action_space.n

# q_func = q_functions.FCStateQFunctionWithDiscreteAction(
#     obs_size, n_actions,
#     n_hidden_channels=n_hidden_channels,
#     n_hidden_layers=n_hidden_layers)
q_func = QFunction(obs_size, n_actions)

# Use epsilon-greedy for exploration
# Constant
explorer = explorers.ConstantEpsilonGreedy(
    epsilon=0.3,
    random_action_func=env.action_space.sample)
# Linear decay
# explorer = explorers.LinearDecayEpsilonGreedy(
#     start_epsilon,
#     end_epsilon,
#     final_exploration_steps,
#     random_action_func=env.action_space.sample)

# Set up Adam optimizer
opt = optimizers.Adam()
opt.setup(q_func)

# DQN uses Experience Replay.
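The snippet above constructs a custom `QFunction` whose definition is not shown. A minimal sketch in the style of the ChainerRL quickstart (the hidden size and tanh activation are assumptions):

import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl


class QFunction(chainer.Chain):
    """Minimal fully connected Q-function; layer sizes are assumed."""

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        super(QFunction, self).__init__()
        with self.init_scope():
            self.l0 = L.Linear(obs_size, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, n_hidden_channels)
            self.l2 = L.Linear(n_hidden_channels, n_actions)

    def __call__(self, x, test=False):
        h = F.tanh(self.l0(x))
        h = F.tanh(self.l1(h))
        return chainerrl.action_value.DiscreteActionValue(self.l2(h))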
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--arch', type=str, default='nature',
                        choices=['nature', 'nips', 'dueling'])
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-episode-len', type=int,
        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
        help='Maximum number of steps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--activation', type=str, default='relu')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta', dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--agent', type=str, default='DQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    activation = parse_activation(args.activation)
    q_func = parse_arch(args.arch, n_actions, activation)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as the Nature paper
    opt = optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # During evaluation, select random actions with 5% probability
        eval_explorer = explorers.ConstantEpsilonGreedy(
            args.eval_epsilon, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=eval_env,
        )
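The script above relies on a `parse_agent` helper that is not shown. A plausible sketch (a hypothetical reconstruction, not the source's definition) maps the `--agent` flag to the corresponding ChainerRL agent class:

def parse_agent(agent):
    # Hypothetical helper: look up the agent class named by the --agent flag.
    return {'DQN': agents.DQN,
            'DoubleDQN': agents.DoubleDQN,
            'PAL': agents.PAL}[agent]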
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('rom', type=str)
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    # In training, losing a life is treated as a terminal state
    env = ale.ALE(args.rom, use_sdl=args.use_sdl, seed=train_seed)
    misc.env_modifiers.make_reward_clipped(env, -1, 1)
    # In testing, an episode terminates only when all lives are lost
    eval_env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                       treat_life_lost_as_terminal=False, seed=test_seed)

    n_actions = env.number_of_actions

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = chainerrl.links.Sequence(
        chainerrl.links.NatureDQNHead(),
        chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
            None, n_actions, n_atoms, v_min, v_max,
            n_hidden_channels=0, n_hidden_layers=0),
    )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(2.5e-4, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    agent = chainerrl.agents.CategoricalDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=dqn_phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # During evaluation, select random actions with 5% probability
        eval_explorer = explorers.ConstantEpsilonGreedy(
            args.eval_epsilon, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            save_best_so_far_agent=False,
            eval_env=eval_env)
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=4 * 10**6)
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(),
                            L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99,
                       i_target=40000,
                       explorer=explorer, phi=dqn_phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample)

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            eval_explorer=explorer,
            global_step_hooks=[lr_decay_hook])
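The NSQ scripts pass a `dqn_phi` feature extractor that is never defined here. A hedged sketch of what it likely does, based on the `phi` defined in the gym-based scripts (the 4-frame stack and uint8 screens are assumptions):

def dqn_phi(screens):
    # Hypothetical sketch: stack the last 4 preprocessed ALE screens
    # and scale pixel values from [0, 255] to [0, 1].
    assert len(screens) == 4
    return np.asarray(screens, dtype=np.float32) / 255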
def main():
    # This prevents numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    # logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=4 * 10**6)
    parser.add_argument('--outdir', type=str, default='nsq_output')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(),
                            L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99,
                       i_target=40000,
                       explorer=explorer, phi=dqn_phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample)

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            eval_explorer=explorer,
            global_step_hooks=[lr_decay_hook])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-episode-len', type=int,
        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
        help='Maximum number of steps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = chainerrl.links.Sequence(
        chainerrl.links.NatureDQNHead(),
        chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
            None, n_actions, n_atoms, v_min, v_max,
            n_hidden_channels=0, n_hidden_layers=0),
    )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(2.5e-4, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.CategoricalDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # During evaluation, select random actions with 5% probability
        eval_explorer = explorers.ConstantEpsilonGreedy(
            args.eval_epsilon, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=eval_env,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('rom', type=str)
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--arch', type=str, default='nature',
                        choices=['nature', 'nips', 'dueling'])
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--activation', type=str, default='relu')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta', dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--agent', type=str, default='DQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    # In training, losing a life is treated as a terminal state
    env = ale.ALE(args.rom, use_sdl=args.use_sdl, seed=train_seed)
    misc.env_modifiers.make_reward_clipped(env, -1, 1)
    # In testing, an episode terminates only when all lives are lost
    eval_env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                       treat_life_lost_as_terminal=False, seed=test_seed)

    n_actions = env.number_of_actions
    activation = parse_activation(args.activation)
    q_func = parse_arch(args.arch, n_actions, activation)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as the Nature paper
    opt = optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    Agent = parse_agent(args.agent)
    agent = Agent(q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=dqn_phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # During evaluation, select random actions with 5% probability
        eval_explorer = explorers.ConstantEpsilonGreedy(
            args.eval_epsilon, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            save_best_so_far_agent=False,
            eval_env=eval_env)
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('rom', type=str)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=10 ** 6)
    parser.add_argument('--model', type=str, default='')
    parser.add_argument('--arch', type=str, default='nature',
                        choices=['nature', 'nips', 'dueling'])
    parser.add_argument('--steps', type=int, default=10 ** 7)
    parser.add_argument('--replay-start-size', type=int, default=5 * 10 ** 4)
    parser.add_argument('--target-update-frequency',
                        type=int, default=10 ** 4)
    parser.add_argument('--eval-frequency', type=int, default=10 ** 5)
    parser.add_argument('--update-frequency', type=int, default=4)
    parser.add_argument('--activation', type=str, default='relu')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta', dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--agent', type=str, default='DQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    # In training, losing a life is treated as a terminal state
    env = ale.ALE(args.rom, use_sdl=args.use_sdl)
    misc.env_modifiers.make_reward_clipped(env, -1, 1)
    # In testing, an episode terminates only when all lives are lost
    eval_env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                       treat_life_lost_as_terminal=False)

    n_actions = env.number_of_actions
    activation = parse_activation(args.activation)
    q_func = parse_arch(args.arch, n_actions, activation)

    # Use the same hyperparameters as the Nature paper
    opt = optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, 0.1, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    Agent = parse_agent(args.agent)
    agent = Agent(q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_frequency=args.target_update_frequency,
                  clip_delta=args.clip_delta,
                  update_frequency=args.update_frequency,
                  batch_accumulator='sum',
                  phi=dqn_phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        mean, median, stdev = experiments.eval_performance(
            env=eval_env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        # During evaluation, select random actions with 5% probability
        eval_explorer = explorers.ConstantEpsilonGreedy(
            5e-2, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_frequency=args.eval_frequency,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            eval_env=eval_env)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument(
        '--max-episode-len', type=int,
        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
        help='Maximum number of steps for each episode.')
    parser.add_argument('--final-exploration-frames',
                        type=int, default=4 * 10**6)
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(),
                            L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99,
                       i_target=40000,
                       explorer=explorer, phi=phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample)

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            eval_explorer=explorer,
            max_episode_len=args.max_episode_len,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
# The opening of this registry was truncated in the source; the name
# `available_optimizers` is assumed from the parallel `available_explorers`
# dict below.
available_optimizers = {
    'adam': optimizers.Adam(alpha=1e-3),
    'adadelta': optimizers.AdaDelta(rho=0.95),
}

available_explorers = {
    'boltzmann': explorers.Boltzmann(T=1.0),
    'lin_decay_eps_greedy': explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0, end_epsilon=0.0, decay_steps=10,
        random_action_func=None, logger=None),
    'const_eps_greedy': explorers.ConstantEpsilonGreedy(
        epsilon=1.0, random_action_func=None, logger=None),
}


class BaseAgent(object):

    def __init__(self, env, feature_transformer, gamma=0.99,
                 optimizer='adadelta', explorer='boltzmann'):
        self.actions = dict(
            [(a, i) for (i, a) in enumerate(env.actions_available)])
        self.n_actions = len(self.actions)
        self.n_dims = feature_transformer.dimensions
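A hedged usage sketch for the registries above (not from the source): the epsilon-greedy entries are built with `random_action_func=None`, so a real sampler has to be attached before the explorer can act. The `env` object and the stubbed greedy policy are assumptions.

# Look up pre-built objects from the registries, then patch in a sampler.
explorer = available_explorers['const_eps_greedy']
explorer.random_action_func = env.action_space.sample  # must be set before use
opt = available_optimizers['adam']
action = explorer.select_action(t=0, greedy_action_func=lambda: 0)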