def _test_abc(self, steps=100000,
              require_success=True, gpu=-1, load_model=False):
    if self.recurrent and gpu >= 0:
        self.skipTest(
            'NStepLSTM does not support double backprop with GPU.')
    if self.recurrent and chainer.__version__ == '7.0.0b3':
        self.skipTest(
            'chainer==7.0.0b3 has a bug in double backprop of LSTM.'
            ' See https://github.com/chainer/chainer/pull/8037')
    env, _ = self.make_env_and_successful_return(test=False)
    test_env, successful_return = self.make_env_and_successful_return(
        test=True)
    agent = self.make_agent(env, gpu)
    if load_model:
        print('Load agent from', self.agent_dirname)
        agent.load(self.agent_dirname)
    max_episode_len = None if self.episodic else 2

    # Train
    train_agent_with_evaluation(
        agent=agent,
        env=env,
        eval_env=test_env,
        steps=steps,
        outdir=self.tmpdir,
        eval_interval=200,
        eval_n_steps=None,
        eval_n_episodes=5,
        successful_score=successful_return,
        train_max_episode_len=max_episode_len,
    )
    agent.stop_episode()

    # Test
    n_test_runs = 10
    eval_returns = run_evaluation_episodes(
        test_env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
        max_episode_len=max_episode_len,
    )
    if require_success:
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        self.assertEqual(n_succeeded, n_test_runs)

    # Save
    agent.save(self.agent_dirname)
def test_timesteps(self):
    agent = mock.Mock()
    env = mock.Mock()
    # First episode: 0 -> 1 -> 2 -> 3 (reset)
    # Second episode: 4 -> 5 -> 6 -> 7 (done)
    env.reset.side_effect = [('state', 0), ('state', 4)]
    env.step.side_effect = [
        (('state', 1), 0.1, False, {}),
        (('state', 2), 0.2, False, {}),
        (('state', 3), 0.3, False, {'needs_reset': True}),
        (('state', 5), -0.5, False, {}),
        (('state', 6), 0, False, {}),
        (('state', 7), 1, True, {}),
    ]
    if self.n_episodes:
        # Specifying both n_steps and n_episodes must fail
        with self.assertRaises(AssertionError):
            evaluator.run_evaluation_episodes(
                env, agent,
                n_steps=self.n_timesteps,
                n_episodes=self.n_episodes)
    else:
        scores = evaluator.run_evaluation_episodes(
            env, agent,
            n_steps=self.n_timesteps,
            n_episodes=self.n_episodes)
        if self.n_timesteps == 2:
            self.assertAlmostEqual(len(scores), 1)
            self.assertAlmostEqual(scores[0], 0.3)
            self.assertEqual(agent.stop_episode.call_count, 1)
        elif self.n_timesteps == 5:
            self.assertAlmostEqual(len(scores), 1)
            self.assertAlmostEqual(scores[0], 0.6)
            self.assertEqual(agent.stop_episode.call_count, 2)
        else:
            self.assertAlmostEqual(len(scores), 2)
            self.assertAlmostEqual(scores[0], 0.6)
            self.assertAlmostEqual(scores[1], 0.5)
            self.assertEqual(agent.stop_episode.call_count, 2)
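# test_timesteps reads self.n_timesteps and self.n_episodes from the test
# class, so it is presumably driven by chainer.testing's parameterize
# utilities. A minimal sketch of such a wrapper; the exact parameter values
# are an assumption, chosen to exercise the branches asserted above:
import unittest

from chainer import testing


@testing.parameterize(*testing.product({
    'n_timesteps': [2, 5, 6],
    'n_episodes': [None, 1],
}))
class TestRunEvaluationEpisodesWithTimesteps(unittest.TestCase):
    pass  # test_timesteps above would be defined in this class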
def _test_abc(self, steps=100000,
              require_success=True, gpu=-1, load_model=False):
    env, _ = self.make_env_and_successful_return(test=False)
    test_env, successful_return = self.make_env_and_successful_return(
        test=True)
    agent = self.make_agent(env, gpu)
    if load_model:
        print('Load agent from', self.agent_dirname)
        agent.load(self.agent_dirname)
    max_episode_len = None if self.episodic else 2

    # Train
    train_agent_with_evaluation(
        agent=agent,
        env=env,
        eval_env=test_env,
        steps=steps,
        outdir=self.tmpdir,
        eval_interval=200,
        eval_n_steps=None,
        eval_n_episodes=5,
        successful_score=successful_return,
        train_max_episode_len=max_episode_len,
    )
    agent.stop_episode()

    # Test
    n_test_runs = 5
    eval_returns = run_evaluation_episodes(
        test_env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
        max_episode_len=max_episode_len,
    )
    if require_success:
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        self.assertEqual(n_succeeded, n_test_runs)

    # Save
    agent.save(self.agent_dirname)
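# _test_abc is a template method: concrete test classes are expected to
# supply make_agent and make_env_and_successful_return. A hedged sketch of
# the env half, assuming ChainerRL's toy chainerrl.envs.abc.ABC env; the
# successful return of 1 is an assumption for illustration:
from chainerrl.envs.abc import ABC


def make_env_and_successful_return(self, test):
    # Deterministic dynamics at test time make success reproducible
    env = ABC(discrete=True, episodic=self.episodic, deterministic=test)
    return env, 1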
def test_needs_reset(self):
    agent = mock.Mock()
    env = mock.Mock()
    # First episode: 0 -> 1 -> 2 -> 3 (reset)
    # Second episode: 4 -> 5 -> 6 -> 7 (done)
    env.reset.side_effect = [('state', 0), ('state', 4)]
    env.step.side_effect = [
        (('state', 1), 0, False, {}),
        (('state', 2), 0, False, {}),
        (('state', 3), 0, False, {'needs_reset': True}),
        (('state', 5), -0.5, False, {}),
        (('state', 6), 0, False, {}),
        (('state', 7), 1, True, {}),
    ]
    scores = evaluator.run_evaluation_episodes(
        env, agent, n_steps=None, n_episodes=2)
    self.assertAlmostEqual(len(scores), 2)
    self.assertAlmostEqual(scores[0], 0)
    self.assertAlmostEqual(scores[1], 0.5)
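# Both tests above mock the Gym-style step contract, where env.step()
# returns (obs, reward, done, info) and an episode may end without a
# terminal state by setting info['needs_reset']. A minimal sketch of an
# env honoring that contract (hypothetical, for illustration only):
class EarlyResetEnv(object):

    def __init__(self, episode_len=3):
        self.episode_len = episode_len
        self.t = 0

    def reset(self):
        self.t = 0
        return ('state', self.t)

    def step(self, action):
        self.t += 1
        # Signal a non-terminal episode boundary via the info dict
        info = {'needs_reset': self.t >= self.episode_len}
        return ('state', self.t), 0.0, False, info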
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', type=int, default=16)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument('--max-frames', type=int,
                        default=30 * 60 * 60,  # 30 minutes at 60 fps
                        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-eps', type=int, default=320)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--disadvantage', action='store_true', default=False,
                        help='Set this option to disable the advantage'
                             ' function.')
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--random-agent', action='store_true', default=False,
                        help='Use with --demo to get random-agent results.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)], os.path.join(args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi,
                    disadvantage=args.disadvantage,
                    use_average_reward=True)

    if args.load or args.load_pretrained:
        # At most one of --load and --load-pretrained may be given
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(misc.download_model(
                "A3C", args.env, model_type="final")[0])

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        if not args.random_agent:
            eval_stats = experiments.eval_performance(
                env=env,
                agent=agent,
                n_steps=args.eval_n_steps,
                n_episodes=None)
            print('n_steps: {} mean: {} median: {} stdev: {}'.format(
                args.eval_n_steps, eval_stats['mean'],
                eval_stats['median'], eval_stats['stdev']))
        else:
            agent = random_agent(envname=args.env)
            results = run_evaluation_episodes(
                env=env,
                agent=agent,
                max_episode_len=args.max_frames,
                n_steps=None,
                n_episodes=args.eval_eps)
            with open(os.path.join(args.outdir, 'scores.txt'), 'a+') as f:
                print('\n'.join(str(result) for result in results), file=f)
            print('episodes: {} mean: {} min: {} max: {}'.format(
                len(results), sum(results) / len(results),
                min(results), max(results)))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=True,
        )
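# The script presumably ends with the standard entry-point guard. Example
# invocations (the file name train_a3c_ale.py is an assumption; the flags
# are those defined in the parser above):
#
#   python train_a3c_ale.py --processes 16 --env BreakoutNoFrameskip-v4
#   python train_a3c_ale.py --demo --load-pretrained
if __name__ == '__main__':
    main()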