def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    n_actions = env.action_space.n

    q_func = chainerrl.agents.iqn.ImplicitQuantileQFunction(
        psi=chainerrl.links.Sequence(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            functools.partial(F.reshape, shape=(-1, 3136)),
        ),
        phi=chainerrl.links.Sequence(
            chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
            F.relu,
        ),
        f=chainerrl.links.Sequence(
            L.Linear(None, 512),
            F.relu,
            L.Linear(None, n_actions),
        ),
    )

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros((4, 84, 84), dtype=np.float32)[None]
    fake_taus = np.zeros(32, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(fake_obss)(fake_taus)],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as https://arxiv.org/abs/1710.10044
    opt = chainer.optimizers.Adam(5e-5, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    if args.prioritized:
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10 ** 6, alpha=0.5, beta0=0.4, betasteps=betasteps,
            num_steps=args.num_step_return)
    else:
        rbuf = replay_buffer.ReplayBuffer(
            10 ** 6, num_steps=args.num_step_return)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator=args.batch_accumulator,
        phi=phi,
        quantile_thresholds_N=args.quantile_thresholds_N,
        quantile_thresholds_N_prime=args.quantile_thresholds_N_prime,
        quantile_thresholds_K=args.quantile_thresholds_K,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            checkpoint_freq=args.checkpoint_frequency,
            step_offset=args.step_offset,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
            log_type=args.log_type,
        )
        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # Run args.n_best_episodes evaluation episodes (200 in the reference
        # setup), each capped at 30 minutes of play (max_frames emulator
        # frames, i.e. max_frames / 4 agent steps).
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames / 4,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            # Temporary hack to handle Python 2/3 support issues:
            # json.dumps does not support non-string literal dict keys.
            json_stats = json.dumps(stats)
            print(str(json_stats), file=f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
    elif args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_frames, save_mp4=args.save_mp4)
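
# Why the reshape in the psi network above targets 3136 features: with the
# standard 84x84 DeepMind Atari preprocessing, the three convolutions shrink
# the frame to 20x20, 9x9, and finally 7x7 with 64 channels, and
# 64 * 7 * 7 = 3136. A minimal sketch of that arithmetic (illustrative helper
# only, not used by main):
def _conv_feature_size(input_size=84):
    def out(size, kernel, stride):
        # Valid convolution: floor((size - kernel) / stride) + 1
        return (size - kernel) // stride + 1
    s = out(input_size, 8, 4)   # 84 -> 20
    s = out(s, 4, 2)            # 20 -> 9
    s = out(s, 3, 1)            # 9 -> 7
    return 64 * s * s           # 64 channels * 7 * 7 = 3136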
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        print("Use NAF to apply DQN to continuous action spaces")
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        print("Discrete action space: use a fully-connected Q-function")
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DoubleDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    eval_env = make_env(test=True)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=timestep_limit,
            log_type=args.log_type,
        )
    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt

        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        from IPython.display import HTML
        plt.figure(
            figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0),
            dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(
            plt.gcf(), animate, frames=len(frames), interval=50)
        anim.save(args.save_mp4)
        return anim
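
# Background for the continuous-action branch above: FCQuadraticStateQFunction
# follows the normalized advantage function (NAF) idea, where the Q-value is a
# quadratic function of the action, so the greedy action is simply mu(s).
# A rough plain-Python sketch of that form with toy inputs (illustrative only,
# not the actual ChainerRL internals):
def _naf_q_value(action, mu, P, v):
    # Q(s, a) = V(s) - 0.5 * (a - mu(s))^T P(s) (a - mu(s))
    # action, mu: lists of floats; P: positive-definite matrix as nested lists
    diff = [a - m for a, m in zip(action, mu)]
    quad = sum(diff[i] * P[i][j] * diff[j]
               for i in range(len(diff)) for j in range(len(diff)))
    return v - 0.5 * quad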
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        # Atari: automatically press FIRE on reset
        env = atari_wrappers.FireResetEnvAuto(env)
        print("Set FireResetEnvAuto")
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        print(env.action_space)
        return env

    env = make_env(test=False)
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space
    n_actions = action_space.n

    q_func = chainerrl.links.Sequence(
        chainerrl.links.NatureDQNHead(),
        L.Linear(512, n_actions),
        chainerrl.action_value.DiscreteActionValue)

    # Use the same hyperparameters as the Nature paper
    optimizer = chainer.optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    optimizer.setup(q_func)

    rbuf = chainerrl.replay_buffer.ReplayBuffer(10 ** 6)

    explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
        1.0, 0.1, 10 ** 6,
        lambda: np.random.randint(n_actions))

    def dqn_phi(screens):
        assert len(screens) == 4
        assert screens[0].dtype == np.uint8
        raw_values = np.asarray(screens, dtype=np.float32)
        # [0, 255] -> [0, 1]
        raw_values /= 255.0
        return raw_values

    agent = chainerrl.agents.DQN(
        q_func, optimizer, rbuf, gpu=0, gamma=0.99,
        explorer=explorer, replay_start_size=5 * 10 ** 4,
        target_update_interval=10 ** 4,
        clip_delta=True,
        update_interval=4,
        batch_accumulator='sum',
        phi=dqn_phi)

    if args.load_agent:
        agent.load(args.load_agent)

    eval_env = make_env(test=True)

    if args.mode == 'train':
        def step_hook(env, agent, step):
            sys.stdout.write("\r{} / {} steps.".format(step, args.steps))
            sys.stdout.flush()

        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=args.max_frames,
            eval_max_episode_len=args.max_frames,
            log_type=args.log_type,
            step_hooks=[step_hook],
        )
    elif args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, max_num=args.max_frames,
            save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_frames, save_mp4=args.save_mp4)
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    train_env = make_env(test=False)
    timestep_limit = train_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = train_env.observation_space
    action_space = train_env.action_space

    # Switch policy types according to the action space type
    if isinstance(action_space, gym.spaces.Box):
        model = chainerrl.policies.FCGaussianPolicyWithFixedCovariance(
            obs_space.low.size,
            action_space.low.size,
            var=0.1,
            n_hidden_channels=200,
            n_hidden_layers=2,
            nonlinearity=chainer.functions.leaky_relu,
        )
    else:
        model = chainerrl.policies.FCSoftmaxPolicy(
            obs_space.low.size,
            action_space.n,
            n_hidden_channels=200,
            n_hidden_layers=2,
            nonlinearity=chainer.functions.leaky_relu,
        )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [model(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu(args.gpu)

    opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(1))

    agent = chainerrl.agents.REINFORCE(
        model, opt, beta=args.beta, batchsize=args.batchsize)

    eval_env = make_env(test=True)

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=train_env,
            eval_env=eval_env,
            outdir=args.outdir,
            steps=args.steps,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=timestep_limit,
            log_type=args.log_type,
        )
    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt

        env = make_env(test=True)
        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        from IPython.display import HTML
        plt.figure(
            figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0),
            dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(
            plt.gcf(), animate, frames=len(frames), interval=50)
        anim.save(args.save_mp4)
        return anim
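
# REINFORCE above updates the policy with the score-function gradient:
#     grad J = E[ sum_t grad log pi(a_t|s_t) * G_t ],
# where G_t is the return from step t onward. A minimal sketch of the return
# weights for one episode (illustrative only, not ChainerRL's implementation;
# entropy regularization via beta is omitted):
def _reinforce_return_weights(rewards, gamma=1.0):
    # Returns G_t for each step t; each grad log pi(a_t|s_t) is scaled by G_t.
    returns = []
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    return list(reversed(returns))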
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=not args.no_frame_stack,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    print('Observation space', env.observation_space)
    print('Action space', env.action_space)
    n_actions = env.action_space.n

    if args.recurrent:
        # Q-network with LSTM
        q_func = chainerrl.links.StatelessRecurrentSequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            functools.partial(F.reshape, shape=(-1, 3136)),
            F.relu,
            L.NStepLSTM(1, 3136, 512, 0),
            L.Linear(None, n_actions),
            DiscreteActionValue,
        )
        # Replay buffer that stores whole episodes
        rbuf = replay_buffer.EpisodicReplayBuffer(10 ** 6)
    else:
        # Q-network without LSTM
        q_func = chainer.Sequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            functools.partial(F.reshape, shape=(-1, 3136)),
            L.Linear(None, 512),
            F.relu,
            L.Linear(None, n_actions),
            DiscreteActionValue,
        )
        # Replay buffer that stores transitions separately
        rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros(env.observation_space.shape, dtype=np.float32)[None]
    if args.recurrent:
        fake_out, _ = q_func(fake_obss, None)
    else:
        fake_out = q_func(fake_obss)
    chainerrl.misc.draw_computational_graph(
        [fake_out], os.path.join(args.outdir, 'model'))

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    opt = chainer.optimizers.Adam(1e-4, eps=1e-4)
    opt.setup(q_func)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.DoubleDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=phi,
        minibatch_size=args.batch_size,
        episodic_update_len=args.episodic_update_len,
        recurrent=args.recurrent,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            checkpoint_freq=args.checkpoint_frequency,
            step_offset=args.step_offset,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            log_type=args.log_type,
        )
    elif args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_frames, save_mp4=args.save_mp4)
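
# The explorer above anneals epsilon linearly from 1.0 down to
# args.final_epsilon over args.final_exploration_frames steps and then keeps
# it constant. A small sketch of that schedule (illustrative, mirroring what
# LinearDecayEpsilonGreedy is expected to do, not ChainerRL's own code):
def _linear_epsilon(step, start=1.0, end=0.01, decay_steps=10 ** 6):
    if step >= decay_steps:
        return end
    return start + (end - start) * step / decay_steps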
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = chainerrl.links.Sequence(
        chainerrl.links.NatureDQNHead(),
        chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
            None, n_actions, n_atoms, v_min, v_max,
            n_hidden_channels=0, n_hidden_layers=0),
    )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(2.5e-4, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.CategoricalDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            checkpoint_freq=args.checkpoint_frequency,
            step_offset=args.step_offset,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
            log_type=args.log_type)
    elif args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_frames, save_mp4=args.save_mp4)
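
# The distributional head above represents each action's return distribution
# on a fixed support of n_atoms values between v_min and v_max (C51). With the
# settings used here that is 51 atoms spaced 0.4 apart: -10.0, -9.6, ..., 10.0.
# A small sketch of that support and the resulting Q-value (illustrative only,
# not ChainerRL's implementation):
def _categorical_q_value(probs, n_atoms=51, v_min=-10.0, v_max=10.0):
    # probs: length-n_atoms probability vector for one action
    delta_z = (v_max - v_min) / (n_atoms - 1)          # 20 / 50 = 0.4
    support = [v_min + i * delta_z for i in range(n_atoms)]
    return sum(p * z for p, z in zip(probs, support))  # expected return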
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)

    agent = chokoDQN(env, args=args)

    # Draw the computational graph and save it in the output directory.
    # chainerrl.misc.draw_computational_graph(
    #     [agent.q_func(np.zeros_like(agent.obs_space.low,
    #                                 dtype=np.float32)[None])],
    #     os.path.join(args.outdir, 'model'))

    if args.load_agent:
        agent.load(args.load_agent)

    eval_env = make_env(test=True)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=args.max_episode_len,
            eval_max_episode_len=args.max_episode_len,
            log_type=args.log_type)
    elif args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_episode_len, save_mp4=args.save_mp4)
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    n_atoms = 51
    v_max = 500
    v_min = 0
    n_actions = action_space.n
    q_func = q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
        obs_size, n_actions, n_atoms, v_min, v_max,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon,
        args.final_exploration_steps,
        action_space.sample)

    opt = optimizers.Adam(1e-3)
    opt.setup(q_func)

    rbuf_capacity = 50000  # 5 * 10 ** 5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.CategoricalDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    eval_env = make_env(test=True)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=timestep_limit,
            log_type=args.log_type)
    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt

        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        from IPython.display import HTML
        plt.figure(
            figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0),
            dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(
            plt.gcf(), animate, frames=len(frames), interval=50)
        anim.save(args.save_mp4)
        return anim
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    n_actions = env.action_space.n

    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0, args.final_epsilon, args.final_exploration_frames,
            lambda: np.random.randint(n_actions))

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the Nature paper's hyperparameters
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10 ** 6, alpha=0.6, beta0=0.4, betasteps=betasteps,
            num_steps=args.num_step_return)
    else:
        rbuf = replay_buffer.ReplayBuffer(10 ** 6, args.num_step_return)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        clip_delta=args.clip_delta,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi)

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_env=eval_env,
            checkpoint_freq=args.checkpoint_frequency,
            step_offset=args.step_offset,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_type=args.log_type)
    elif args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_frames, save_mp4=args.save_mp4)
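
# Both replay buffers above store n-step transitions (num_steps =
# args.num_step_return), so the TD target becomes an n-step return:
#     R_t = r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1}
#           + gamma^n * max_a Q_target(s_{t+n}, a)
# A small sketch of that target (illustrative only, not ChainerRL internals):
def _n_step_target(rewards, bootstrap_value, gamma=0.99):
    # rewards: the n intermediate rewards; bootstrap_value: max_a Q(s_{t+n}, a)
    target = 0.0
    for k, r in enumerate(rewards):
        target += (gamma ** k) * r
    return target + (gamma ** len(rewards)) * bootstrap_value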
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))
    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        print("Use NAF to apply DQN to continuous action spaces")
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        print("Discrete action space: use a fully-connected Q-function")
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    eval_env = make_env(test=True)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=args.max_episode_len,
            log_type=args.log_type,
        )
    elif args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_episode_len, save_mp4=args.save_mp4)
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def make_env(test):
        env = gym.make(args.env)
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    hidden_size = 64
    q_func = chainerrl.agents.iqn.ImplicitQuantileQFunction(
        psi=chainerrl.links.Sequence(
            L.Linear(obs_size, hidden_size),
            F.relu,
        ),
        phi=chainerrl.links.Sequence(
            chainerrl.agents.iqn.CosineBasisLinear(64, hidden_size),
            F.relu,
        ),
        f=L.Linear(hidden_size, env.action_space.n),
    )

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon,
        args.final_exploration_steps,
        action_space.sample)

    opt = optimizers.Adam(1e-3)
    opt.setup(q_func)

    rbuf_capacity = 50000  # 5 * 10 ** 5
    rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.IQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
    )

    eval_env = make_env(test=True)

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=timestep_limit,
            log_type=args.log_type)
    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt

        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        from IPython.display import HTML
        plt.figure(
            figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0),
            dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(
            plt.gcf(), animate, frames=len(frames), interval=50)
        anim.save(args.save_mp4)
        return anim
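
# IQN trains the quantile network with the quantile Huber loss: for a TD error
# delta sampled at quantile fraction tau,
#     rho_tau(delta) = |tau - 1{delta < 0}| * huber(delta) / kappa.
# A small element-wise sketch of that loss (illustrative only, kappa = 1 as in
# the IQN paper; not ChainerRL's own implementation):
def _quantile_huber_loss(delta, tau, kappa=1.0):
    abs_delta = abs(delta)
    huber = 0.5 * delta ** 2 if abs_delta <= kappa \
        else kappa * (abs_delta - 0.5 * kappa)
    weight = abs(tau - (1.0 if delta < 0 else 0.0))
    return weight * huber / kappa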
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use async training (--train-async), the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if args.train_async:
        # Set different random seeds for different subprocesses.
        # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
        # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
        process_seeds = np.arange(args.processes) + args.seed * args.processes
        assert process_seeds.max() < 2 ** 32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        if args.train_async:
            process_seed = int(process_seeds[process_idx])
            env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        else:
            env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if isinstance(action_space, gym.spaces.Box):
        model = chainerrl.agents.pcl.PCLSeparateModel(
            pi=chainerrl.policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high,
                var_wscale=1e-3,
                var_bias=1,
                var_type='diagonal',
            ),
            v=chainerrl.v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
            ),
        )
    else:
        model = chainerrl.agents.pcl.PCLSeparateModel(
            pi=chainerrl.policies.FCSoftmaxPolicy(
                obs_space.low.size,
                action_space.n,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers),
            v=chainerrl.v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
            ),
        )

    if not args.train_async and args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu(args.gpu)

    if args.train_async:
        opt = rmsprop_async.RMSpropAsync(lr=args.lr, alpha=0.99)
    else:
        opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)

    if args.prioritized_replay:
        replay_buffer = \
            chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(
                capacity=5 * 10 ** 3,
                uniform_ratio=0.1,
                default_priority_func=exp_return_of_episode,
                wait_priority_after_sampling=False,
                return_sample_weights=False)
    else:
        replay_buffer = chainerrl.replay_buffer.EpisodicReplayBuffer(
            capacity=5 * 10 ** 3)

    agent = chainerrl.agents.PCL(
        model, opt,
        replay_buffer=replay_buffer,
        t_max=args.t_max,
        gamma=0.99,
        tau=args.tau,
        rollout_len=args.rollout_len,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        batchsize=args.batchsize,
        train_async=args.train_async,
        disable_online_update=args.disable_online_update,
        backprop_future_values=args.backprop_future_values,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        if args.train_async:
            experiments.train_agent_async(
                agent=agent,
                outdir=args.outdir,
                processes=args.processes,
                make_env=make_env,
                profile=args.profile,
                steps=args.steps,
                step_offset=args.step_offset,
                eval_n_steps=None,
                eval_n_episodes=args.eval_n_runs,
                eval_interval=args.eval_interval,
                max_episode_len=timestep_limit)
            if args.save_agent:
                agent.save(args.save_agent)
        else:
            experiments.train_agent_with_evaluation(
                agent=agent,
                env=make_env(0, test=False),
                eval_env=make_env(0, test=True),
                outdir=args.outdir,
                steps=args.steps,
                step_offset=args.step_offset,
                checkpoint_freq=args.checkpoint_freq,
                eval_n_steps=None,
                eval_n_episodes=args.eval_n_runs,
                eval_interval=args.eval_interval,
                train_max_episode_len=timestep_limit)
    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt

        # Build an evaluation env for the rollouts below
        env = make_env(0, test=True)
        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        from IPython.display import HTML
        plt.figure(
            figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0),
            dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(
            plt.gcf(), animate, frames=len(frames), interval=50)
        anim.save(args.outdir + '/test.mp4')
        return anim
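
# PCL minimizes a path consistency error over sub-trajectories: for a rollout
# of length d,
#     C = -V(s_t) + gamma^d * V(s_{t+d})
#         + sum_{j<d} gamma^j * (r_{t+j} - tau * log pi(a_{t+j}|s_{t+j}))
# and the training loss is C^2 / 2. A rough sketch of that error for one
# rollout (illustrative only, not ChainerRL's implementation):
def _path_consistency_error(values, rewards, log_pis, gamma=0.99, tau=1e-2):
    # values: [V(s_t), ..., V(s_{t+d})]; rewards, log_pis: length-d sequences
    d = len(rewards)
    c = -values[0] + (gamma ** d) * values[-1]
    for j in range(d):
        c += (gamma ** j) * (rewards[j] - tau * log_pis[j])
    return c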