def test_save_and_load(self):
    """Round-trip a multi-step prioritized buffer through save/load.

    Two items are appended (the first after ``num_steps`` transitions, the
    second after one more), the buffer is saved to a temporary file, and a
    freshly constructed buffer must report the same length and return the
    same items from ``sample`` after loading that file.
    """
    import shutil

    capacity = self.capacity
    num_steps = self.num_steps

    tempdir = tempfile.mkdtemp()
    # mkdtemp() leaves removal to the caller; clean up even when an
    # assertion below fails so test runs do not accumulate directories.
    self.addCleanup(shutil.rmtree, tempdir, ignore_errors=True)

    rbuf = replay_buffer.PrioritizedReplayBuffer(capacity,
                                                 num_steps=num_steps)

    # Add two transitions
    correct_item = collections.deque([], maxlen=num_steps)
    for _ in range(num_steps):
        trans1 = dict(state=0, action=1, reward=2, next_state=3,
                      next_action=4, is_state_terminal=False)
        correct_item.append(trans1)
        rbuf.append(**trans1)
    correct_item2 = copy.deepcopy(correct_item)
    trans2 = dict(state=1, action=1, reward=2, next_state=3,
                  next_action=4, is_state_terminal=True)
    correct_item2.append(trans2)
    rbuf.append(**trans2)

    # Now it has two transitions
    self.assertEqual(len(rbuf), 2)

    # Save
    filename = os.path.join(tempdir, 'rbuf.pkl')
    rbuf.save(filename)

    # Initialize rbuf
    rbuf = replay_buffer.PrioritizedReplayBuffer(capacity,
                                                 num_steps=num_steps)

    # Of course it has no transition yet
    self.assertEqual(len(rbuf), 0)

    # Load the previously saved buffer
    rbuf.load(filename)

    # Now it has two transitions again
    self.assertEqual(len(rbuf), 2)

    # And sampled transitions are exactly what I added!
    s2 = rbuf.sample(2)
    # 'weight' is attached by sample() and is not part of what was appended.
    del s2[0][0]['weight']
    del s2[1][0]['weight']
    # Sampling order is not fixed; identify the items by their last state.
    if s2[0][num_steps - 1]['state'] == 0:
        self.assertEqual(s2[0], list(correct_item))
        self.assertEqual(s2[1], list(correct_item2))
    else:
        self.assertEqual(s2[0], list(correct_item2))
        self.assertEqual(s2[1], list(correct_item))
def test_save_and_load(self):
    """Round-trip a prioritized buffer through save/load.

    Two transitions are appended, the buffer is saved to a temporary file,
    and a freshly constructed buffer must report the same length and return
    the same transitions from ``sample`` after loading that file.
    """
    import shutil

    capacity = self.capacity

    tempdir = tempfile.mkdtemp()
    # mkdtemp() leaves removal to the caller; clean up even when an
    # assertion below fails so test runs do not accumulate directories.
    self.addCleanup(shutil.rmtree, tempdir, ignore_errors=True)

    rbuf = replay_buffer.PrioritizedReplayBuffer(capacity)

    # Add two transitions
    trans1 = dict(state=0, action=1, reward=2, next_state=3,
                  next_action=4, is_state_terminal=True)
    rbuf.append(**trans1)
    trans2 = dict(state=1, action=1, reward=2, next_state=3,
                  next_action=4, is_state_terminal=True)
    rbuf.append(**trans2)

    # Now it has two transitions
    self.assertEqual(len(rbuf), 2)

    # Save
    filename = os.path.join(tempdir, 'rbuf.pkl')
    rbuf.save(filename)

    # Initialize rbuf
    rbuf = replay_buffer.PrioritizedReplayBuffer(capacity)

    # Of course it has no transition yet
    self.assertEqual(len(rbuf), 0)

    # Load the previously saved buffer
    rbuf.load(filename)

    # Now it has two transitions again
    self.assertEqual(len(rbuf), 2)

    # And sampled transitions are exactly what I added!
    s2 = rbuf.sample(2)
    # 'weight' is attached by sample() and is not part of what was appended.
    del s2[0]['weight']
    del s2[1]['weight']
    # Sampling order is not fixed; identify the transitions by their state.
    if s2[0]['state'] == 0:
        self.assertEqual(s2[0], trans1)
        self.assertEqual(s2[1], trans2)
    else:
        self.assertEqual(s2[0], trans2)
        self.assertEqual(s2[1], trans1)
def test_fail_noupdate(self):
    """Sample twice in a row without calling ``update_errors`` in between.

    The second ``sample`` call is the one that is expected to fail.
    NOTE(review): how that failure is asserted (e.g. a decorator on this
    method) is not visible here -- confirm against the enclosing class.
    """
    transition = dict(
        state=0,
        action=1,
        reward=2,
        next_state=3,
        next_action=4,
        is_state_terminal=True,
    )
    buf = replay_buffer.PrioritizedReplayBuffer(100)
    buf.append(**transition)

    buf.sample(1)
    buf.sample(1)  # This line must fail.
def create_agent(env):
    """Build a DQN agent with a prioritized replay buffer for ``env``.

    Args:
        env: Environment object; this function reads ``env.grid_size`` to
            size the Q-function input and passes ``env.random_action`` to
            the explorer as its random action function.

    Returns:
        The constructed ``DQN.DQN`` agent.

    Note:
        ``minibatch_size`` is not defined in this function; it is read from
        the enclosing scope (presumably a module-level constant -- verify).
    """
    q_func = QFunction(env.grid_size ** 2, 4)

    # Linearly decay epsilon from start_epsilon to end_epsilon.
    start_epsilon = 1.
    end_epsilon = 0.8
    decay_steps = 20000
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon, end_epsilon, decay_steps, env.random_action)

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 3
    steps = 50000
    replay_start_size = 20
    update_interval = 10
    # Number of updates expected during training.  Previously this value
    # (and update_interval) was computed but never handed to the buffer or
    # the agent, so the schedule configured here had no effect.
    betasteps = (steps - replay_start_size) // update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(
        rbuf_capacity, betasteps=betasteps)

    def phi(x):
        # Cast observations to float32 without copying when already float32.
        return x.astype(np.float32, copy=False)

    agent = DQN.DQN(
        q_func, opt, rbuf,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=replay_start_size,
        # Keep the agent's update cadence consistent with betasteps above.
        update_interval=update_interval,
        phi=phi,
        minibatch_size=minibatch_size)
    return agent
def test_capacity(self):
    """Check that the buffer length never exceeds its capacity.

    Skipped (returns immediately) when ``self.capacity`` is ``None``.
    """
    limit = self.capacity
    if limit is None:
        return

    buf = replay_buffer.PrioritizedReplayBuffer(limit)

    # Fill the buffer
    filler = dict(state=0, action=1, reward=2, next_state=3,
                  next_action=4, is_state_terminal=True)
    for _ in range(limit):
        buf.append(**filler)
    self.assertEqual(len(buf), limit)

    # Add a new transition
    extra = dict(state=1, action=1, reward=2, next_state=3,
                 next_action=4, is_state_terminal=True)
    buf.append(**extra)

    # The size should not change
    self.assertEqual(len(buf), limit)
def test_append_and_sample(self):
    """Exercise append/sample/update_errors on a multi-step buffer.

    Checks the buffer length after appends, that sampled items equal what
    was appended, and how the 'weight' field of sampled items relates to
    the errors passed to ``update_errors`` (with ``error_max=5``).
    """
    num_steps = self.num_steps

    def new_transition(state, terminal):
        # Fresh dict per call so stored and expected items never alias.
        return dict(state=state, action=1, reward=2, next_state=3,
                    next_action=4, is_state_terminal=terminal)

    rbuf = replay_buffer.PrioritizedReplayBuffer(
        self.capacity,
        normalize_by_max=self.normalize_by_max,
        error_max=5,
        num_steps=num_steps)

    self.assertEqual(len(rbuf), 0)

    # Add one and sample one
    correct_item = collections.deque([], maxlen=num_steps)
    for _ in range(num_steps):
        step = new_transition(0, False)
        correct_item.append(step)
        rbuf.append(**step)
    self.assertEqual(len(rbuf), 1)

    s1 = rbuf.sample(1)
    rbuf.update_errors([3.14])
    self.assertEqual(len(s1), 1)
    self.assertAlmostEqual(s1[0][0]['weight'], 1.0)
    del s1[0][0]['weight']
    self.assertEqual(s1[0], list(correct_item))

    # Add two and sample two, which must be unique
    correct_item2 = copy.deepcopy(correct_item)
    last_step = new_transition(1, True)
    correct_item2.append(last_step)
    rbuf.append(**last_step)
    self.assertEqual(len(rbuf), 2)

    s2 = rbuf.sample(2)
    rbuf.update_errors([3.14, 2.71])
    self.assertEqual(len(s2), 2)
    for sampled in s2:
        del sampled[0]['weight']
    # Sampling order is not fixed; the last state tells the items apart.
    if s2[0][num_steps - 1]['state'] == 1:
        expected_first, expected_second = correct_item2, correct_item
    else:
        expected_first, expected_second = correct_item, correct_item2
    self.assertEqual(s2[0], list(expected_first))
    self.assertEqual(s2[1], list(expected_second))

    # Weights should be different for different TD-errors
    s3 = rbuf.sample(2)
    self.assertNotAlmostEqual(s3[0][0]['weight'], s3[1][0]['weight'])

    # Weights should be equal for different but clipped TD-errors
    rbuf.update_errors([5, 10])
    s3 = rbuf.sample(2)
    self.assertAlmostEqual(s3[0][0]['weight'], s3[1][0]['weight'])

    # Weights should be equal for the same TD-errors
    rbuf.update_errors([3.14, 3.14])
    s4 = rbuf.sample(2)
    self.assertAlmostEqual(s4[0][0]['weight'], s4[1][0]['weight'])
def setUp(self):
    """Create a 100-capacity prioritized buffer holding one transition.

    The transition is kept on ``self.trans1`` and the buffer on
    ``self.rbuf`` for use by the test methods.
    """
    self.trans1 = dict(
        state=0,
        action=1,
        reward=2,
        next_state=3,
        next_action=4,
        is_state_terminal=True,
    )
    self.rbuf = replay_buffer.PrioritizedReplayBuffer(100)
    self.rbuf.append(**self.trans1)
def test_append_and_sample(self):
    """Exercise append/sample/update_errors on a single-step buffer.

    Checks the buffer length after appends, that sampled transitions equal
    what was appended, and how the 'weight' field of sampled transitions
    relates to the errors passed to ``update_errors`` (``error_max=5``).
    """
    rbuf = replay_buffer.PrioritizedReplayBuffer(
        self.capacity,
        normalize_by_max=self.normalize_by_max,
        error_max=5)

    self.assertEqual(len(rbuf), 0)

    # Add one and sample one
    first = dict(state=0, action=1, reward=2, next_state=3,
                 next_action=4, is_state_terminal=True)
    rbuf.append(**first)
    self.assertEqual(len(rbuf), 1)

    s1 = rbuf.sample(1)
    rbuf.update_errors([3.14])
    self.assertEqual(len(s1), 1)
    self.assertAlmostEqual(s1[0]['weight'], 1.0)
    del s1[0]['weight']
    self.assertEqual(s1[0], first)

    # Add two and sample two, which must be unique
    second = dict(state=1, action=1, reward=2, next_state=3,
                  next_action=4, is_state_terminal=True)
    rbuf.append(**second)
    self.assertEqual(len(rbuf), 2)

    s2 = rbuf.sample(2)
    rbuf.update_errors([3.14, 2.71])
    self.assertEqual(len(s2), 2)
    for sampled in s2:
        del sampled['weight']
    # Sampling order is not fixed; the state tells the transitions apart.
    if s2[0]['state'] == 0:
        expected_first, expected_second = first, second
    else:
        expected_first, expected_second = second, first
    self.assertEqual(s2[0], expected_first)
    self.assertEqual(s2[1], expected_second)

    # Weights should be different for different TD-errors
    s3 = rbuf.sample(2)
    self.assertNotAlmostEqual(s3[0]['weight'], s3[1]['weight'])

    # Weights should be equal for different but clipped TD-errors
    rbuf.update_errors([5, 10])
    s3 = rbuf.sample(2)
    self.assertAlmostEqual(s3[0]['weight'], s3[1]['weight'])

    # Weights should be equal for the same TD-errors
    rbuf.update_errors([3.14, 3.14])
    s4 = rbuf.sample(2)
    self.assertAlmostEqual(s4[0]['weight'], s4[1]['weight'])
def dqn_q_values_and_neuronal_net(self, args, action_space, obs_size,
                                  obs_space):
    """Build the Q-function, optimizer, replay buffer and explorer for DQN.

    Args:
        args: Parsed options; the attributes read here include
            ``n_hidden_channels``, ``n_hidden_layers``, ``start_epsilon``,
            ``end_epsilon``, ``final_exploration_steps``,
            ``noisy_net_sigma``, ``outdir``, ``minibatch_size``,
            ``prioritized_replay``, ``steps``, ``replay_start_size`` and
            ``update_interval``.  ``args.minibatch_size`` is set to 32 in
            place when it is ``None``.
        action_space: Action space; a ``spaces.Box`` selects the
            continuous-action branch, anything else the discrete one.
        obs_size: Observation size passed to the Q-function constructor.
        obs_space: Observation space; ``obs_space.low`` provides the
            template for the dummy input used to draw the graph.

    Returns:
        Tuple ``(q_func, opt, rbuf, explorer)``.
    """
    hidden_kwargs = dict(n_hidden_channels=args.n_hidden_channels,
                         n_hidden_layers=args.n_hidden_layers)

    if not isinstance(action_space, spaces.Box):
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, action_space.n, **hidden_kwargs)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon,
            args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample)
    else:
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_space.low.size,
            action_space=action_space, **hidden_kwargs)
        # Use the Ornstein-Uhlenbeck process for exploration
        explorer = explorers.AdditiveOU(
            sigma=(action_space.high - action_space.low) * 0.2)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph into <outdir>/model using a dummy,
    # all-zero batch of one observation.
    dummy_obs = np.zeros_like(obs_space.low, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(dummy_obs)], os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32

    if not args.prioritized_replay:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)
    else:
        betasteps = ((args.steps - args.replay_start_size)
                     // args.update_interval)
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)

    return q_func, opt, rbuf, explorer
def buffer(self):
    """Return the replay buffer selected by ``args.prioritized_replay``.

    Also sets ``args.minibatch_size`` to 32 when it is ``None``.
    NOTE(review): ``args`` is not a parameter or attribute here; it is read
    from an enclosing scope -- confirm where it is defined.
    """
    capacity = 5 * 10**5

    if args.minibatch_size is None:
        args.minibatch_size = 32

    if not args.prioritized_replay:
        return replay_buffer.ReplayBuffer(capacity)

    anneal_steps = ((args.steps - args.replay_start_size)
                    // args.update_interval)
    return replay_buffer.PrioritizedReplayBuffer(capacity,
                                                 betasteps=anneal_steps)
def test(self):
    """Check the length of a 5-step buffer fed from several ``env_id``s.

    Transitions are appended for three environment ids and the expected
    ``len(rbuf)`` is asserted before and after ``stop_current_episode`` is
    called for two of them.
    """
    n = 5

    kind = self.replay_buffer_type
    if kind == 'ReplayBuffer':
        buffer_cls = replay_buffer.ReplayBuffer
    elif kind == 'PrioritizedReplayBuffer':
        buffer_cls = replay_buffer.PrioritizedReplayBuffer
    else:
        assert False
    rbuf = buffer_cls(capacity=None, num_steps=n)

    def transition(terminal):
        return dict(state=0, action=1, reward=2, next_state=3,
                    next_action=4, is_state_terminal=terminal)

    # 2 transitions for env_id=0
    for _ in range(2):
        rbuf.append(env_id=0, **transition(False))

    # 4 transitions for env_id=1 with a terminal state
    for step in range(4):
        rbuf.append(env_id=1, **transition(step == 3))

    # 9 transitions for env_id=2
    for _ in range(9):
        rbuf.append(env_id=2, **transition(False))

    # It should have:
    #   - 4 transitions from env_id=1
    #   - 5 transitions from env_id=2
    self.assertEqual(len(rbuf), 9)

    # env_id=0 episode ends
    rbuf.stop_current_episode(env_id=0)

    # Now it should have 9 + 2 = 11 transitions
    self.assertEqual(len(rbuf), 11)

    # env_id=2 episode ends
    rbuf.stop_current_episode(env_id=2)

    # Finally it should have 9 + 2 + 4 = 15 transitions
    self.assertEqual(len(rbuf), 15)
def test_normalize_by_max(self):
    """Check how sampled weights are normalized for each mode.

    One hundred transitions get distinct errors (their 'state' value), then
    minibatches of size 1..100 are sampled and the largest weight in each
    is checked according to ``self.normalize_by_max``.
    """
    mode = self.normalize_by_max
    rbuf = replay_buffer.PrioritizedReplayBuffer(
        self.capacity,
        normalize_by_max=mode,
        error_max=1000,
        num_steps=self.num_steps,
    )

    # Add 100 transitions
    for state in range(100):
        rbuf.append(state=state, action=1, reward=2, next_state=state + 1,
                    next_action=1, is_state_terminal=False)
    assert len(rbuf) == 100

    def set_errors_based_on_state(rbuf, samples):
        # Use the value of 'state' as an error, so that state 0 will have
        # the smallest error, thus the largest weight
        rbuf.update_errors([item[0]['state'] for item in samples])

    # Assign different errors to all the transitions first
    samples = rbuf.sample(100)
    set_errors_based_on_state(rbuf, samples)

    # Repeatedly check how weights are normalized
    for batch_size in range(1, 101):
        samples = rbuf.sample(batch_size)
        weights = [item[0]['weight'] for item in samples]

        # All the weights must be unique
        self.assertEqual(len(set(weights)), len(samples))

        # Now check the maximum weight in a minibatch
        max_w = max(weights)
        if mode == 'batch':
            # Maximum weight in a minibatch must be 1
            self.assertAlmostEqual(max_w, 1)
        elif mode == 'memory':
            # Maximum weight in a minibatch must be less than 1 unless
            # the minibatch contains the transition of least error.
            has_least_error = any(
                item[0]['state'] == 0 for item in samples)
            if has_least_error:
                self.assertAlmostEqual(max_w, 1)
            else:
                self.assertLess(max_w, 1)

        # Every sample() must be followed by an error update.
        set_errors_based_on_state(rbuf, samples)
def create_agent(env):
    """Build a Double DQN agent with a prioritized replay buffer.

    Args:
        env: Environment object; this function reads ``env.state_size``,
            ``env.action_size`` and passes ``env.random_action`` to the
            explorer as its random action function.

    Returns:
        The constructed ``DDQN.DoubleDQN`` agent.

    Note:
        ``feature_max_count``, ``MAX_EPISODE`` and ``args`` are read from
        the enclosing scope (presumably module level -- verify).
    """
    q_func = QFunction(env.state_size, env.action_size)

    # Epsilon decays linearly from 1.0 to 0.3.
    explorer = explorers.LinearDecayEpsilonGreedy(
        1.,
        0.3,
        feature_max_count * MAX_EPISODE / 2,
        env.random_action)

    opt = optimizers.Adam()
    opt.setup(q_func)

    minibatch_size = 16
    replay_start_size = 20
    update_interval = 10
    total_steps = 1000
    betasteps = (total_steps - replay_start_size) // update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(
        5 * 10**3, betasteps=betasteps)

    def phi(x):
        # Cast observations to float32 without copying when already float32.
        return x.astype(np.float32, copy=False)

    return DDQN.DoubleDQN(
        q_func,
        opt,
        rbuf,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=replay_start_size,
        # How often the target Q-network is synced with the Q-network.
        target_update_interval=10,
        update_interval=update_interval,
        phi=phi,
        minibatch_size=minibatch_size,
        target_update_method='hard',
        soft_update_tau=1e-2,
        episodic_update=False,
        # Selects whether a GPU is used.
        gpu=args.gpu,
        episodic_update_len=16)
def main():
    """Train or evaluate a DQN-family agent on batched Atari environments.

    Parses command-line options, seeds ChainerRL, builds the Q-function,
    optimizer, replay buffer, explorer and agent, then either evaluates a
    loaded agent (``--demo``) or runs batch training with periodic
    evaluation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.01)
    parser.add_argument('--eval-epsilon', type=float, default=0.001)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch', type=str, default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'])
    parser.add_argument('--steps', type=int, default=5 * 10**7)
    parser.add_argument(
        '--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval',
                        type=int, default=3 * 10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta', action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--agent', type=str, default='DoubleDQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate')
    parser.add_argument('--prioritized', action='store_true', default=False,
                        help='Use prioritized experience replay.')
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(idx, test):
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            frame_stack=False,
        )
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        # One environment factory per subprocess index.
        vec_env = chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])
        vec_env = chainerrl.wrappers.VectorFrameStack(vec_env, 4)
        return vec_env

    sample_env = make_env(0, test=False)

    n_actions = sample_env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    # Default exploration strategy.  It must be created BEFORE the noisy-net
    # check below: it used to be assigned afterwards, which silently
    # replaced the Greedy explorer chosen for noisy nets.
    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    if args.noisy_net_sigma is not None:
        # Forward the requested sigma; previously the option's value was
        # ignored and only its presence mattered.
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10**6, alpha=0.6, beta0=0.4, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(10**6)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
def main():
    """Train or evaluate a Categorical DQN agent on a Gym environment.

    Parses command-line options, seeds ChainerRL, builds the distributional
    Q-function, explorer, optimizer, replay buffer and agent, then either
    evaluates (``--demo``) or trains with periodic evaluation.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('--outdir', type=str, default='results',
        help='Directory path to save output files.'
             ' If it does not exist, it will be created.')
    add('--env', type=str, default='CartPole-v1')
    add('--seed', type=int, default=0)
    add('--gpu', type=int, default=0)
    add('--final-exploration-steps', type=int, default=1000)
    add('--start-epsilon', type=float, default=1.0)
    add('--end-epsilon', type=float, default=0.1)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default=None)
    add('--steps', type=int, default=10**8)
    add('--prioritized-replay', action='store_true')
    add('--episodic-replay', action='store_true')
    add('--replay-start-size', type=int, default=50)
    add('--target-update-interval', type=int, default=100)
    add('--target-update-method', type=str, default='hard')
    add('--soft-update-tau', type=float, default=1e-2)
    add('--update-interval', type=int, default=1)
    add('--eval-n-runs', type=int, default=100)
    add('--eval-interval', type=int, default=1000)
    add('--n-hidden-channels', type=int, default=12)
    add('--n-hidden-layers', type=int, default=3)
    add('--gamma', type=float, default=0.95)
    add('--minibatch-size', type=int, default=None)
    add('--render-train', action='store_true')
    add('--render-eval', action='store_true')
    add('--monitor', action='store_true')
    add('--reward-scale-factor', type=float, default=1.0)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        # Train and test envs get different seeds.
        if test:
            env_seed = 2**32 - 1 - args.seed
        else:
            env_seed = args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(
                env, args.reward_scale_factor)
        wants_render = args.render_eval if test else args.render_train
        if wants_render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    # Parameters of the categorical return distribution.
    n_atoms = 51
    v_max = 500
    v_min = 0

    n_actions = action_space.n
    q_func = q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
        obs_size, n_actions, n_atoms, v_min, v_max,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon,
        args.end_epsilon,
        args.final_exploration_steps,
        action_space.sample)

    opt = optimizers.Adam(1e-3)
    opt.setup(q_func)

    rbuf_capacity = 50000  # 5 * 10 ** 5
    episodic = args.episodic_replay
    if args.minibatch_size is None:
        # Default minibatch size depends on the replay mode.
        args.minibatch_size = 4 if episodic else 32
    if args.prioritized_replay:
        betasteps = ((args.steps - args.replay_start_size)
                     // args.update_interval)
        if episodic:
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
    elif episodic:
        rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.CategoricalDQN(
        q_func, opt, rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
        episodic_update=args.episodic_replay,
        episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if not args.demo:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)
        return

    eval_stats = experiments.eval_performance(
        env=eval_env,
        agent=agent,
        n_runs=args.eval_n_runs,
        max_episode_len=timestep_limit)
    print('n_runs: {} mean: {} median: {} stdev {}'.format(
        args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
        eval_stats['stdev']))
def main():
    """Train or evaluate a DQN (or NAF) agent on a Gym environment.

    Parses command-line options, seeds ChainerRL, builds a Q-function and
    explorer suited to the action space (``spaces.Box`` vs. discrete),
    creates the optimizer, replay buffer and agent, then either evaluates
    (``--demo``) or trains with periodic evaluation.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('--outdir', type=str, default='results',
        help='Directory path to save output files.'
             ' If it does not exist, it will be created.')
    add('--env', type=str, default='Pendulum-v0')
    add('--seed', type=int, default=0,
        help='Random seed [0, 2 ** 32)')
    add('--gpu', type=int, default=0)
    add('--final-exploration-steps', type=int, default=10**4)
    add('--start-epsilon', type=float, default=1.0)
    add('--end-epsilon', type=float, default=0.1)
    add('--noisy-net-sigma', type=float, default=None)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default=None)
    add('--steps', type=int, default=10**5)
    add('--prioritized-replay', action='store_true')
    add('--replay-start-size', type=int, default=1000)
    add('--target-update-interval', type=int, default=10**2)
    add('--target-update-method', type=str, default='hard')
    add('--soft-update-tau', type=float, default=1e-2)
    add('--update-interval', type=int, default=1)
    add('--eval-n-runs', type=int, default=100)
    add('--eval-interval', type=int, default=10**4)
    add('--n-hidden-channels', type=int, default=100)
    add('--n-hidden-layers', type=int, default=2)
    add('--gamma', type=float, default=0.99)
    add('--minibatch-size', type=int, default=None)
    add('--render-train', action='store_true')
    add('--render-eval', action='store_true')
    add('--monitor', action='store_true')
    add('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        # action_space is bound further down in main(), before any env
        # step can invoke this filter.
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        if test:
            env_seed = 2**32 - 1 - args.seed
        else:
            env_seed = args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(
                env, args.reward_scale_factor)
        wants_render = args.render_eval if test else args.render_train
        if wants_render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    hidden_kwargs = dict(n_hidden_channels=args.n_hidden_channels,
                         n_hidden_layers=args.n_hidden_layers)
    if not isinstance(action_space, spaces.Box):
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, action_space.n, **hidden_kwargs)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon,
            args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample)
    else:
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_space.low.size,
            action_space=action_space, **hidden_kwargs)
        # Use the Ornstein-Uhlenbeck process for exploration
        explorer = explorers.AdditiveOU(
            sigma=(action_space.high - action_space.low) * 0.2)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    dummy_obs = np.zeros_like(obs_space.low, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(dummy_obs)], os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if not args.prioritized_replay:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)
    else:
        betasteps = ((args.steps - args.replay_start_size)
                     // args.update_interval)
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)

    agent = DQN(
        q_func, opt, rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if not args.demo:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit)
        return

    eval_stats = experiments.eval_performance(
        env=eval_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        max_episode_len=timestep_limit)
    print('n_runs: {} mean: {} median: {} stdev {}'.format(
        args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
        eval_stats['stdev']))
def main():
    """Train or evaluate a DQN (or NAF) agent on a Gym environment.

    Parses command-line options, prepares the output directory, optionally
    seeds ChainerRL, builds a Q-function and explorer suited to the action
    space (``spaces.Box`` vs. discrete), creates the optimizer, replay
    buffer and agent, then either evaluates (``--demo``) or trains with
    periodic evaluation.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('--outdir', type=str, default='dqn_out')
    add('--env', type=str, default='Pendulum-v0')
    add('--seed', type=int, default=None)
    add('--gpu', type=int, default=0)
    add('--final-exploration-steps', type=int, default=10**4)
    add('--start-epsilon', type=float, default=1.0)
    add('--end-epsilon', type=float, default=0.1)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default=None)
    add('--steps', type=int, default=10**5)
    add('--prioritized-replay', action='store_true')
    add('--episodic-replay', action='store_true')
    add('--replay-start-size', type=int, default=1000)
    add('--target-update-interval', type=int, default=10**2)
    add('--target-update-method', type=str, default='hard')
    add('--soft-update-tau', type=float, default=1e-2)
    add('--update-interval', type=int, default=1)
    add('--eval-n-runs', type=int, default=100)
    add('--eval-interval', type=int, default=10**4)
    add('--n-hidden-channels', type=int, default=100)
    add('--n-hidden-layers', type=int, default=2)
    add('--gamma', type=float, default=0.99)
    add('--minibatch-size', type=int, default=None)
    add('--render-train', action='store_true')
    add('--render-eval', action='store_true')
    add('--monitor', action='store_true')
    add('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Seeding is optional here: nothing is seeded when --seed is omitted.
    if args.seed is not None:
        misc.set_random_seed(args.seed)

    def clip_action_filter(a):
        # action_space is bound further down in main(), before any env
        # step can invoke this filter.
        return np.clip(a, action_space.low, action_space.high)

    def make_env(for_eval):
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not for_eval:
            # Rewards are scaled for the training env only.
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if for_eval:
            wants_render = args.render_eval
        else:
            wants_render = args.render_train
        if wants_render:
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(for_eval=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    hidden_kwargs = dict(n_hidden_channels=args.n_hidden_channels,
                         n_hidden_layers=args.n_hidden_layers)
    if not isinstance(action_space, spaces.Box):
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, action_space.n, **hidden_kwargs)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon,
            args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample)
    else:
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_space.low.size,
            action_space=action_space, **hidden_kwargs)
        # Use the Ornstein-Uhlenbeck process for exploration
        explorer = explorers.AdditiveOU(
            sigma=(action_space.high - action_space.low) * 0.2)

    # Draw the computational graph and save it in the output directory.
    dummy_obs = np.zeros_like(obs_space.low, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(dummy_obs)], os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    episodic = args.episodic_replay
    if args.minibatch_size is None:
        # Default minibatch size depends on the replay mode.
        args.minibatch_size = 4 if episodic else 32
    if args.prioritized_replay:
        betasteps = ((args.steps - args.replay_start_size)
                     // args.update_interval)
        if episodic:
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
    elif episodic:
        rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        # Feature extractor: cast observations to float32.
        return obs.astype(np.float32)

    agent = DQN(
        q_func, opt, rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        phi=phi,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
        episodic_update=args.episodic_replay,
        episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(for_eval=True)

    if not args.demo:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)
        return

    eval_stats = experiments.eval_performance(
        env=eval_env,
        agent=agent,
        n_runs=args.eval_n_runs,
        max_episode_len=timestep_limit)
    print('n_runs: {} mean: {} median: {} stdev {}'.format(
        args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
        eval_stats['stdev']))
def main():
    """Train (or demo) a DoubleDQN agent on the malware gym environments.

    Parses command-line options, builds a fully-connected Q-function for the
    discrete action space of ``malware-v0`` (``malware-test-v0`` is used for
    evaluation), and then either evaluates a loaded agent (``--demo``) or
    runs training with periodic evaluation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=123,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--final-exploration-steps',
                        type=int, default=10 ** 4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=50000)
    parser.add_argument('--prioritized-replay',
                        action='store_true', default=False)
    parser.add_argument('--episodic-replay',
                        action='store_true', default=False)
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval',
                        type=int, default=10 ** 2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=50)
    parser.add_argument('--eval-interval', type=int, default=10 ** 3)
    parser.add_argument('--n-hidden-channels', type=int, default=512)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    # NOTE(review): store_true combined with default=True means monitoring
    # cannot be switched off from the command line.
    parser.add_argument('--monitor', action='store_true', default=True)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        """Create the training env, or the held-out test env if ``test``."""
        env_name = 'malware-test-v0' if test else 'malware-v0'
        env = gym.make(env_name)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Reward scaling is deliberately not applied here, so
        # --reward-scale-factor currently has no effect.
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    timestep_limit = 80
    obs_space = env.observation_space
    obs_size = obs_space.shape[0]
    action_space = env.action_space

    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size, n_actions,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    if args.gpu >= 0:
        q_func.to_gpu(args.gpu)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)

    if args.noisy_net_sigma is not None:
        # Forward the requested sigma; without sigma_scale the command-line
        # value would be ignored in favour of the library default.
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Noisy layers provide the exploration, so turn off the explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if args.gpu < 0:
        # Feed a zero observation of the right shape. The space object
        # itself is not an array, so build the dummy input from its shape.
        dummy_obs = np.zeros(obs_space.shape, dtype=np.float32)
        chainerrl.misc.draw_computational_graph(
            [q_func(dummy_obs[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.minibatch_size is None:
        # Episodic replay samples whole episodes, hence the smaller batch
        args.minibatch_size = 4 if args.episodic_replay else 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        if args.episodic_replay:
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
    elif args.episodic_replay:
        rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        """Cast observations to float32 before they reach the model."""
        return obs.astype(np.float32)

    agent = DoubleDQN(q_func, opt, rbuf, gamma=args.gamma,
                      explorer=explorer,
                      replay_start_size=args.replay_start_size,
                      target_update_interval=args.target_update_interval,
                      update_interval=args.update_interval,
                      phi=phi, minibatch_size=args.minibatch_size,
                      target_update_method=args.target_update_method,
                      soft_update_tau=args.soft_update_tau,
                      episodic_update=args.episodic_replay,
                      episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        q_hook = PlotHook('Average Q Value')
        loss_hook = PlotHook('Average Loss', plot_index=1)

        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            max_episode_len=timestep_limit,
            step_hooks=[q_hook, loss_hook],
            successful_score=7
        )
def main(args):
    """Train a DoubleDQN agent on a gym env, or record a few test episodes.

    :param args: Either an already parsed argument object or a list of
        command-line style strings, which is converted with ``make_args``.
    :return: In ``check`` mode the matplotlib animation of the rendered
        episodes (also written to ``args.save_mp4``); otherwise ``None``.
    """
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        """Clip a continuous action to the bounds of the action space."""
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        """Build the wrapped environment for training or evaluation."""
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        print("Use NAF to apply DQN to continuous action spaces")
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        print("not continuous action spaces")
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DoubleDQN(q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
                      explorer=explorer,
                      replay_start_size=args.replay_start_size,
                      target_update_interval=args.target_update_interval,
                      update_interval=args.update_interval,
                      minibatch_size=args.minibatch_size,
                      target_update_method=args.target_update_method,
                      soft_update_tau=args.soft_update_tau,
                      )

    if args.load_agent:
        agent.load(args.load_agent)

    eval_env = make_env(test=True)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=timestep_limit,
            log_type=args.log_type
        )
    elif args.mode == 'check':
        # Imported lazily so that training does not require matplotlib.
        from matplotlib import animation
        import matplotlib.pyplot as plt

        # Roll out three episodes (at most 200 steps each) and keep every
        # rendered frame for the animation.
        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        # Size the figure so that one frame pixel maps to one figure pixel.
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0), dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            """Show the i-th recorded frame."""
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), animate,
                                       frames=len(frames), interval=50)
        anim.save(args.save_mp4)
        return anim
def main():
    """Train or evaluate a DDQN agent on the SigMV toy environments.

    Parses the command line, builds the environment for the requested
    scenario, and then either evaluates a loaded agent (``--evaluate``) or
    trains with periodic evaluation and checkpointing. When ``--load`` is
    given, the agent and its replay buffer are restored and the previous
    score files in the output directory are renamed before continuing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='/tmp/chainerRL_results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--evaluate', action='store_true', default=False,
                        help="Run evaluation mode")
    parser.add_argument('--load', type=str, default=None,
                        help="Load saved_model")
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=11)
    parser.add_argument('--n-hidden-channels', type=int, default=50)
    parser.add_argument('--n-hidden-layers', type=int, default=1)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--reward-scale-factor', type=float, default=1)
    parser.add_argument('--outdir-time-suffix',
                        choices=['empty', 'none', 'time'],
                        default='empty', type=str.lower)
    # argparse does not run ``type`` on non-string defaults, so the default
    # must already be an int (it used to be the float 1e3).
    parser.add_argument('--checkpoint_frequency', type=int, default=10**3,
                        help="Number of steps to checkpoint after")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Use debug log-level')
    parser.add_argument('--scenario', choices=[
        '1D-INST', '1D-DIST', '1DM', '2DM', '3DM', '5DM', '1D3M', '2D3M',
        '3D3M', '5D3M'
    ], default='1D-INST', type=str.upper, help='Which scenario to use.')
    if __name__ != '__main__':
        print(__name__)
        # Has no effect in this file! It can only be used in conjunction
        # with "train_with_wallclock_limit.py".
        parser.add_argument('--timeout', type=int, default=0,
                            help='Wallclock timeout in sec')
    args = parser.parse_args()

    import logging
    logging.basicConfig(
        level=logging.INFO if not args.verbose else logging.DEBUG)

    # Set a random seed used in ChainerRL ALSO SETS NUMPY SEED!
    misc.set_random_seed(args.seed)

    if args.outdir and not args.load:
        outdir_suffix_dict = {
            'none': '',
            'empty': '',
            'time': '%Y%m%dT%H%M%S.%f'
        }
        args.outdir = experiments.prepare_output_dir(
            args, args.outdir, argv=sys.argv,
            time_format=outdir_suffix_dict[args.outdir_time_suffix])
    elif args.load:
        # Continue in the directory that contains the loaded model and move
        # the old score files out of the way under the first free index.
        if args.load.endswith(os.path.sep):
            args.load = args.load[:-1]
        args.outdir = os.path.dirname(args.load)
        count = 0
        fn = os.path.join(args.outdir.format(count), 'scores_{:>03d}')
        while os.path.exists(fn.format(count)):
            count += 1
        os.rename(os.path.join(args.outdir, 'scores.txt'), fn.format(count))
        if os.path.exists(os.path.join(args.outdir, 'best')):
            os.rename(os.path.join(args.outdir, 'best'),
                      os.path.join(args.outdir,
                                   'best_{:>03d}'.format(count)))

    logging.info('Output files are saved in {}'.format(args.outdir))

    # (number of action dimensions, action values per dimension) for the
    # scenarios whose instances are sampled from a seeded distribution.
    sampled_scenarios = {
        '1D-DIST': (1, (2, )),           # Used to create Figure 2(a)
        '1D3M': (1, (3, )),              # Used to create Figure 3(a)
        '2D3M': (2, (3, 3)),             # Used to create Figure 3(b)
        '3D3M': (3, (3, 3, 3)),          # Used to create Figure 3(c)
        '5D3M': (5, (3, 3, 3, 3, 3)),    # Used to create Figure 3(d)
    }

    def make_env(test):
        """Create the (train or test) environment for ``args.scenario``."""
        if args.scenario == '1D-INST':
            # Used to create Figures 2(b)&(c)
            env = SigMV(
                instance_feats=os.path.join(
                    os.path.dirname(os.path.realpath(__file__)), '..',
                    'envs', 'feats.csv' if not test else 'test_feats.csv'),
                seed=args.seed, n_actions=1, action_vals=(2, ))
        elif args.scenario in sampled_scenarios:
            dims, vals = sampled_scenarios[args.scenario]
            # Use different random seeds for train and test envs
            env_seed = 2**32 - 1 - args.seed if test else args.seed
            env = SigMV(n_actions=dims, action_vals=vals, seed=env_seed)
        else:
            # The argument parser accepts further scenario names for which
            # no environment is defined; fail with a clear message rather
            # than an unbound-variable error.
            raise NotImplementedError(
                'Scenario {} is not implemented'.format(args.scenario))

        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        return env

    env = make_env(test=False)
    timestep_limit = 10**3  # TODO don't hardcode env params
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size, n_actions,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if not args.load:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam(eps=1e-2)
    opt.setup(q_func)
    opt.add_hook(GradientClipping(5))

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DDQN(
        q_func, opt, rbuf, gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    t_offset = 0
    if args.load:
        # Continue training model or load for evaluation
        agent.load(args.load)
        rbuf.load(os.path.join(args.load, 'replay_buffer.pkl'))
        # The step offset is taken from a leading "<steps>_" in the name of
        # the loaded directory.
        # NOTE(review): int() on a str raises ValueError, not TypeError, so
        # the 't.txt' fallback below looks unreachable - confirm the intent.
        try:
            t_offset = int(os.path.basename(args.load).split('_')[0])
        except TypeError:
            with open(os.path.join(args.load, 't.txt'), 'r') as fh:
                data = fh.readlines()
            t_offset = int(data[0])
        except ValueError:
            t_offset = 0

    eval_env = make_env(test=True)

    if args.evaluate:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # can be made an argument if we support any other form of
        # checkpointing
        criterion = 'steps'
        checkpoint_logger = logging.getLogger('Checkpoint_Hook')

        def checkpoint(env, agent, step):
            """Step hook that saves agent and replay buffer periodically."""
            if criterion == 'steps':
                if step % args.checkpoint_frequency == 0:
                    save_agent_and_replay_buffer(
                        agent, step, args.outdir, suffix='_chkpt',
                        logger=checkpoint_logger,
                        chckptfrq=args.checkpoint_frequency)
            else:
                # TODO to checkpoint given wall_time we would have to modify
                # the environment such that it tracks time or number of
                # episodes
                raise NotImplementedError

        def eval_hook(env, agent, step):
            """
            Necessary hook to evaluate the DDQN on all 100 Training instances.

            :param env: The training environment
            :param agent: (Partially) Trained agent
            :param step: Number of observed training steps.
            :return:
            """
            if step % 10 == 0:
                # Average the undiscounted return over 100 rollouts and
                # append it, together with the step, to train_reward.txt.
                train_reward = 0
                for _ in range(100):
                    obs = env.reset()
                    done = False
                    rews = 0
                    while not done:
                        obs, r, done, _ = env.step(agent.act(obs))
                        rews += r
                    train_reward += rews
                train_reward = train_reward / 100
                with open(os.path.join(args.outdir, 'train_reward.txt'),
                          'a') as fh:
                    fh.writelines(str(train_reward) + '\t' + str(step) + '\n')

        hooks = [checkpoint]
        if args.scenario == '1D-INST':
            hooks.append(eval_hook)
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,  # unlimited number of steps per evaluation rollout
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit,
            step_hooks=hooks,
            step_offset=t_offset)
def main():
    """Train and/or evaluate a categorical double DQN on CarIntersect-v3.

    Builds a batch of frame-stacked CarIntersect environments running in
    subprocesses, a distributional dueling Q-function with noisy layers and
    a prioritized replay buffer. With ``--load`` the agent stored under
    ``train/best`` is evaluated; training runs when ``--train`` is given or
    ``--load`` is not.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CarIntersect-v3')
    parser.add_argument('--outdir', type=str, default='train/results',
                        help='Directory path to save output files.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', action='store_true', default=None)
    parser.add_argument('--train', action='store_true', default=None)
    parser.add_argument('--eval-epsilon', type=float, default=0.0)
    parser.add_argument('--noisy-net-sigma', type=float, default=0.5)
    parser.add_argument('--steps', type=int, default=2 * 10**6)
    parser.add_argument('--replay-start-size', type=int, default=2 * 10**4)
    parser.add_argument('--eval-n-episodes', type=int, default=5)
    parser.add_argument('--eval-interval', type=int, default=10**4)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env.')
    parser.add_argument('--num-envs', type=int, default=40)
    parser.add_argument('--final-epsilon', type=float, default=0.01)
    parser.add_argument('--final-exploration-frames', type=int,
                        default=2 * 10**4)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    # One distinct seed per sub-environment, derived from the global seed.
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs

    def make_car_env_discrete(max_frames=30 * 30, env_seed=42,
                              random_suffix=None):
        """Create a single wrapped, discretized CarIntersect-v3 env."""
        print('CarIntersect-v3')
        env = gym.make('CarIntersect-v3')
        env = chainerrl.wrappers.ContinuingTimeLimit(
            env, max_episode_steps=max_frames)
        env = MaxAndSkipEnv(env, skip=4)
        env = DiscreteWrapper(env)
        print('save_wrapper')
        env = SaveWrapper(env, random_suffix=random_suffix)
        env = WarpFrame(env)
        env.seed(env_seed)
        return env

    def make_batch_env(test):
        """Create ``args.num_envs`` subprocess envs stacked over 4 frames.

        Every sub-environment gets its own entry of ``process_seeds`` so the
        workers do not all replay the same random sequence; test envs use
        the mirrored seed ``2 ** 32 - 1 - seed``.
        """
        env_fns = []
        for idx in range(args.num_envs):
            seed = int(process_seeds[idx])
            env_seed = 2 ** 32 - 1 - seed if test else seed
            env_fns.append(
                functools.partial(make_car_env_discrete, env_seed=env_seed))
        vec_env = chainerrl.envs.MultiprocessVectorEnv(env_fns)
        vec_env = chainerrl.wrappers.VectorFrameStack(vec_env, 4)
        return vec_env

    env = make_batch_env(test=False)

    n_actions = env.action_space.n

    # Support of the categorical return distribution
    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max,
                                      n_input_channels=12)

    # Noisy nets
    links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Epsilon-greedy exploration is kept on top of the noisy layers:
    # epsilon decays linearly from 0.3 to the final value.
    explorer = explorers.LinearDecayEpsilonGreedy(
        0.3, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(0.00025, eps=1.5 * 10**-4)
    opt.setup(q_func)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    betasteps = args.steps / update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(10**5,
                                                 alpha=0.5,
                                                 beta0=0.4,
                                                 betasteps=betasteps,
                                                 num_steps=10)

    def phi(x):
        """Feature extractor: scale pixel values by 1/255 as float32."""
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.CategoricalDoubleDQN
    print(args.replay_start_size)
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        minibatch_size=64,
        replay_start_size=args.replay_start_size,
        target_update_interval=3 * 10**3,
        update_interval=update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load is True:
        print('evaluation started')
        dir_of_best_network = os.path.join("train/", "best")
        agent.load(dir_of_best_network)

        stats = experiments.evaluator.eval_performance(env=env,
                                                       agent=agent,
                                                       n_steps=None,
                                                       n_episodes=10,
                                                       logger=None)
        print(stats)

    if args.train or not args.load:
        print('training started')
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_episodes,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            log_interval=1000,
        )
def create_ddqn_agent(env, args):
    """Build a ChainerRL DoubleDQN agent for a discrete-action environment.

    :param env: Environment providing ``observation_space`` (whose first
        shape entry is used as the input size) and a discrete
        ``action_space``.
    :param args: Parsed options. ``args.minibatch_size`` is filled in with a
        default (4 for episodic replay, 32 otherwise) when it is ``None``.
    :return: The configured ``chainerrl.agents.DoubleDQN`` instance.
    """
    obs_size = env.observation_space.shape[0]
    action_space = env.action_space

    n_actions = action_space.n
    q_func = QFunction(obs_size, n_actions)
    # NOTE(review): this is a truthiness test, so GPU id 0 is treated as
    # "no GPU" and a value of -1 would be passed to to_gpu - confirm the
    # convention used for args.gpu by the callers.
    if args.gpu:
        q_func.to_gpu(args.gpu)

    # Draw the computational graph and save it in the output directory.
    if not args.test and not args.gpu:
        # Use a zero observation with the shape of the observation space.
        # The space object itself is not an array, so zeros_like on it does
        # not yield a correctly shaped input.
        dummy_obs = np.zeros(env.observation_space.shape, dtype=np.float32)
        chainerrl.misc.draw_computational_graph(
            [q_func(dummy_obs[None])],
            os.path.join(args.outdir, 'model'))

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 3
    if args.minibatch_size is None:
        args.minibatch_size = 4 if args.episodic_replay else 32
    if args.prioritized_replay:
        betasteps = \
            (args.steps - args.replay_start_size) // args.update_interval
        if args.episodic_replay:
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
    elif args.episodic_replay:
        rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(x):
        """Feature extractor: Chainer only accepts numpy.float32 by default,
        so convert observations (without copying when already float32)."""
        return x.astype(np.float32, copy=False)

    agent = chainerrl.agents.DoubleDQN(
        q_func, opt, rbuf, gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        phi=phi, minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
        episodic_update=args.episodic_replay,
        episodic_update_len=16)

    return agent
def chokoDQN(env, args=None):
    """Build a DQN agent matching the action space of ``env``.

    A continuous (``spaces.Box``) action space gets a quadratic (NAF)
    Q-function with Ornstein-Uhlenbeck exploration; any other action space
    gets a fully-connected discrete Q-function with linearly decaying
    epsilon-greedy exploration.

    :param env: Environment providing ``observation_space`` and
        ``action_space``.
    :param args: Parsed options, or a list of command-line style strings
        that is converted with ``make_args``. ``None`` means an empty list.
    :return: The configured ``DQN`` agent.
    """
    args = args or []
    if isinstance(args, list):
        args = make_args(args)

    obs_space = env.observation_space
    # The network input is the observation size multiplied by args.stack_k
    obs_size = obs_space.low.size * args.stack_k
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon,
            args.final_exploration_steps, action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    return agent
def main():
    """Train (or demo) a DQN buzzer on quizbowl guesses and report on it.

    Uses the module-level ``parser`` for options, trains a DQN agent in the
    ``BuzzingGame`` environment, saves the Q-function to ``dqn.npz`` and
    finally pickles the buzzes computed on the dev fold and runs ``report``
    on them.
    """
    import logging
    import os
    logging.basicConfig(level=logging.WARNING)

    args = parser.parse_args()
    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    option2id, all_guesses = load_quizbowl()
    train_iter = QuestionIterator(all_guesses[c.BUZZER_DEV_FOLD],
                                  option2id,
                                  batch_size=1,
                                  make_vector=dense_vector)

    env = BuzzingGame(train_iter)

    timestep_limit = 300
    obs_size = env.observation_size
    action_space = env.action_space

    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size,
        n_actions,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.replay_start_size is None:
            args.replay_start_size = 10
        if args.prioritized_replay:
            # Here replay_start_size is multiplied by the per-episode step
            # limit before being subtracted from the training steps.
            betasteps = \
                (args.steps - timestep_limit * args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.replay_start_size is None:
            args.replay_start_size = 1000
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        """Cast observations to float32 before they reach the model."""
        return obs.astype(np.float32)

    agent = DQN(q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
                explorer=explorer,
                replay_start_size=args.replay_start_size,
                target_update_interval=args.target_update_interval,
                update_interval=args.update_interval,
                phi=phi,
                minibatch_size=args.minibatch_size,
                target_update_method=args.target_update_method,
                soft_update_tau=args.soft_update_tau,
                episodic_update=args.episodic_replay,
                episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    # The evaluation env is built on the same question iterator as training
    eval_env = BuzzingGame(train_iter)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)

    serializers.save_npz('dqn.npz', q_func)

    dev_iter = QuestionIterator(all_guesses[c.BUZZER_DEV_FOLD],
                                option2id,
                                batch_size=128,
                                make_vector=dense_vector)
    dev_buzzes = get_buzzes(q_func, dev_iter)
    dev_buzzes_path = 'output/buzzer/rl/dev_buzzes.pkl'
    # Make sure the target directory exists so that the results of a long
    # training run are not lost to a missing-directory error.
    os.makedirs(os.path.dirname(dev_buzzes_path), exist_ok=True)
    with open(dev_buzzes_path, 'wb') as f:
        pickle.dump(dev_buzzes, f)
    print('Dev buzz {} saved to {}'.format(len(dev_buzzes), dev_buzzes_path))

    report(dev_buzzes_path)
def main():
    """Train (or demo) a DQN-family agent on a MarLo environment.

    Parses the command line, builds the Q-function selected by ``--arch``
    and the agent selected by ``--agent``, then either evaluates a loaded
    agent (``--demo``) or trains with periodic evaluation. The same
    environment instance is used for training and evaluation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='MarLo-FindTheGoal-v0',
                        help='Marlo env to perform algorithm on.')
    parser.add_argument('--out_dir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=10**6,
                        help='Timesteps after which we stop ' +
                             'annealing exploration rate')
    parser.add_argument('--final-epsilon', type=float, default=0.01,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon', type=float, default=0.001,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch', type=str, default='nature',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps', type=int, default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-episode-len', type=int,
        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                             'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int, default=3 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                             'the target network is updated.')
    parser.add_argument('--eval-interval', type=int, default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval', type=int, default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--agent', type=str, default='DQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate.')
    parser.add_argument('--prioritized', action='store_true', default=False,
                        help='Use prioritized experience replay.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    # A single environment, seeded with args.seed, serves both training and
    # evaluation (it is passed as eval_env below).
    env = make_env(args.env, env_seed=args.seed, demo=args.demo)

    n_actions = env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        # Noisy layers drive exploration, so act greedily on top of them.
        # This choice must not be overwritten by the epsilon-greedy
        # explorer, and the requested sigma has to be forwarded.
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0, args.final_epsilon,
            args.final_exploration_frames,
            lambda: np.random.randint(n_actions))

    # Use the Nature paper's hyperparameters
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10**6, alpha=0.6, beta0=0.4, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(10**6)

    def phi(x):
        """Feature extractor: move axis 2 to the front (transpose(2, 0, 1))
        and scale the values by 1/255 as float32."""
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  update_interval=args.update_interval,
                  batch_accumulator='sum', phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
def main():
    """Train or evaluate a DQN-family agent on an Atari environment.

    Command-line options select the environment, network architecture,
    agent class, replay buffer flavour and the training schedule.  With
    ``--demo`` the agent is only evaluated; otherwise it is trained with
    periodic evaluation on a separately seeded environment.
    """
    cli = argparse.ArgumentParser()
    add = cli.add_argument
    add('--env', type=str, default='BreakoutNoFrameskip-v4',
        help='OpenAI Atari domain to perform algorithm on.')
    add('--outdir', type=str, default='results',
        help='Directory path to save output files.'
             ' If it does not exist, it will be created.')
    add('--seed', type=int, default=0,
        help='Random seed [0, 2 ** 31)')
    add('--gpu', type=int, default=0,
        help='GPU to use, set to -1 if no GPU.')
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default=None)
    add('--final-exploration-frames', type=int, default=10**6,
        help='Timesteps after which we stop '
             'annealing exploration rate')
    add('--final-epsilon', type=float, default=0.01,
        help='Final value of epsilon during training.')
    add('--eval-epsilon', type=float, default=0.001,
        help='Exploration epsilon used during eval episodes.')
    add('--noisy-net-sigma', type=float, default=None)
    add('--arch', type=str, default='doubledqn',
        choices=['nature', 'nips', 'dueling', 'doubledqn'],
        help='Network architecture to use.')
    add('--steps', type=int, default=5 * 10**7,
        help='Total number of timesteps to train the agent.')
    add('--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    add('--replay-start-size', type=int, default=5 * 10**4,
        help='Minimum replay buffer size before '
             'performing gradient updates.')
    add('--target-update-interval', type=int, default=3 * 10**4,
        help='Frequency (in timesteps) at which '
             'the target network is updated.')
    add('--eval-interval', type=int, default=10**5,
        help='Frequency (in timesteps) of evaluation phase.')
    add('--update-interval', type=int, default=4,
        help='Frequency (in timesteps) of network '
             'updates.')
    add('--eval-n-runs', type=int, default=10)
    add('--no-clip-delta', dest='clip_delta', action='store_false')
    add('--num-step-return', type=int, default=1)
    cli.set_defaults(clip_delta=True)
    add('--agent', type=str, default='DoubleDQN',
        choices=['DQN', 'DoubleDQN', 'PAL'])
    add('--logging-level', type=int, default=20,
        help='Logging level. 10:DEBUG, 20:INFO etc.')
    add('--render', action='store_true', default=False,
        help='Render env states in a GUI window.')
    add('--monitor', action='store_true', default=False,
        help='Monitor env. Videos and additional information'
             ' are saved as output files.')
    add('--lr', type=float, default=2.5e-4,
        help='Learning rate.')
    add('--prioritized', action='store_true', default=False,
        help='Use prioritized experience replay.')
    args = cli.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Seed the random number generators used by ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Training and evaluation environments receive distinct seeds.
    seed_for_training = args.seed
    seed_for_testing = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        """Build a wrapped Atari env; ``test`` selects evaluation settings."""
        training = not test
        atari = atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames)
        wrapped = atari_wrappers.wrap_deepmind(
            atari, episode_life=training, clip_rewards=training)
        wrapped.seed(int(seed_for_testing if test else seed_for_training))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            wrapped = chainerrl.wrappers.RandomizeAction(
                wrapped, args.eval_epsilon)
        if args.monitor:
            monitor_mode = 'evaluation' if test else 'training'
            wrapped = chainerrl.wrappers.Monitor(
                wrapped, args.outdir, mode=monitor_mode)
        if args.render:
            wrapped = chainerrl.wrappers.Render(wrapped)
        return wrapped

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_function = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is None:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0, args.final_epsilon,
            args.final_exploration_frames,
            lambda: np.random.randint(n_actions))
    else:
        # Noisy layers provide the exploration, so act greedily.
        links.to_factorized_noisy(
            q_function, sigma_scale=args.noisy_net_sigma)
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    dummy_batch = np.zeros((4, 84, 84), dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_function(dummy_batch)],
        os.path.join(args.outdir, 'model'))

    # Use the Nature paper's hyperparameters
    optimizer = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    optimizer.setup(q_function)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        beta_steps = args.steps / args.update_interval
        replay = replay_buffer.PrioritizedReplayBuffer(
            10**6, alpha=0.6, beta0=0.4, betasteps=beta_steps,
            num_steps=args.num_step_return)
    else:
        replay = replay_buffer.ReplayBuffer(10**6, args.num_step_return)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent_cls = parse_agent(args.agent)
    agent = agent_cls(
        q_function, optimizer, replay,
        gpu=args.gpu, gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        clip_delta=args.clip_delta,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        summary = 'n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev'])
        print(summary)
        return

    experiments.train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=args.steps,
        eval_n_steps=None,
        eval_n_episodes=args.eval_n_runs,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        save_best_so_far_agent=False,
        eval_env=eval_env,
    )
def main():
    """Train or evaluate a DDQN agent on the ``FDEnvSelHeur`` environment.

    Without ``--load`` a fresh output directory is prepared.  With
    ``--load`` the agent and its replay buffer are restored from the given
    directory, the previous ``scores.txt`` / ``best`` artifacts in its
    parent directory are archived under a numeric suffix, and training
    resumes from the step offset recovered from the checkpoint.

    With ``--evaluate`` the agent is only evaluated; otherwise it is
    trained and checkpointed every ``--checkpoint_frequency`` steps.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str,
                        default='/tmp/chainerRL_results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--final-exploration-steps',
                        type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--evaluate', action='store_true', default=False,
                        help="Run evaluation mode")
    parser.add_argument('--load', type=str, default=None,
                        help="Load saved_model")
    parser.add_argument('--steps', type=int, default=4 * 10**6)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval',
                        type=int, default=5 * 10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=1)
    # argparse applies ``type`` only to string defaults, so integer options
    # need genuine int defaults (10**4, not the float 1e4).
    parser.add_argument('--eval-interval', type=int, default=10**4,
                        help="After how many steps to evaluate the agent."
                             "(-1 -> always)")
    parser.add_argument('--n-hidden-channels', type=int, default=20)
    parser.add_argument('--n-hidden-layers', type=int, default=20)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1)
    parser.add_argument('--time-step-limit', type=int, default=10**5)
    parser.add_argument('--outdir-time-suffix',
                        choices=['empty', 'none', 'time'],
                        default='empty', type=str.lower)
    parser.add_argument('--checkpoint_frequency', type=int, default=10**3,
                        help="Nuber of steps to checkpoint after")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Use debug log-level')
    args = parser.parse_args()

    import logging
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO)

    # Set a random seed used in ChainerRL ALSO SETS NUMPY SEED!
    misc.set_random_seed(args.seed)

    if args.outdir and not args.load:
        outdir_suffix_dict = {
            'none': '',
            'empty': '',
            'time': '%Y%m%dT%H%M%S.%f',
        }
        args.outdir = experiments.prepare_output_dir(
            args, args.outdir, argv=sys.argv,
            time_format=outdir_suffix_dict[args.outdir_time_suffix])
    elif args.load:
        if args.load.endswith(os.path.sep):
            args.load = args.load[:-1]
        # Resume inside the directory that contains the checkpoint.
        args.outdir = os.path.dirname(args.load)

        def archived_scores(index):
            # The directory is joined verbatim; only the file name is
            # formatted, so braces in the path cannot break the lookup.
            return os.path.join(
                args.outdir, 'scores_{:>03d}'.format(index))

        # Archive the previous run's artifacts under the first free
        # numeric suffix so that resuming does not overwrite them.
        count = 0
        while os.path.exists(archived_scores(count)):
            count += 1
        scores_path = os.path.join(args.outdir, 'scores.txt')
        if os.path.exists(scores_path):
            os.rename(scores_path, archived_scores(count))
        best_path = os.path.join(args.outdir, 'best')
        if os.path.exists(best_path):
            os.rename(
                best_path,
                os.path.join(args.outdir, 'best_{:>03d}'.format(count)))

    logging.info('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        # ``action_space`` is bound further below, before any env step.
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        """Create the wrapped environment; ``test`` selects eval settings."""
        host = ''  # The server's hostname or IP address
        port = 54321  # The port used by the server
        if test:
            # Just such that eval and train env don't have the same port
            port += 1
        # TODO don't hardcode env params
        # TODO if we use this solution (i.e. write port to file and read it
        # with FD) we would have to make sure that outdir doesn't append
        # time strings. Otherwise it will get hard to use on the cluster
        env = FDEnvSelHeur(host=host, port=port,
                           num_heuristics=2, config_dir=args.outdir)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(
                env, args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = args.time_step_limit
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        # Useful if we want to control other continuous parameters.
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if not args.load:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam(eps=1e-2)
    logging.info('Optimizer: %s', str(opt))
    opt.setup(q_func)
    opt.add_hook(GradientClipping(5))

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DDQN(
        q_func, opt, rbuf,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    t_offset = 0
    if args.load:
        # Continue training model or load for evaluation
        agent.load(args.load)
        rbuf.load(os.path.join(args.load, 'replay_buffer.pkl'))
        try:
            # Checkpoint directories may be named '<step>_<suffix>'.
            t_offset = int(os.path.basename(args.load).split('_')[0])
        except ValueError:
            # int() on a non-numeric string raises ValueError (never
            # TypeError), so the 't.txt' fallback has to live here.
            try:
                with open(os.path.join(args.load, 't.txt'), 'r') as fh:
                    t_offset = int(fh.readline())
            except (IOError, OSError, ValueError):
                # No usable step record: restart the step counter.
                t_offset = 0

    # NOTE(review): the evaluation env is created with test=False, i.e. with
    # the training port, seed and reward scaling -- confirm this is intended.
    eval_env = make_env(test=False)

    if args.evaluate:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Can be made an argument if we support any other form of
        # checkpointing.
        criterion = 'steps'
        checkpoint_logger = logging.getLogger('Checkpoint_Hook')

        def checkpoint(env, agent, step):
            """Step hook: save agent and replay buffer periodically."""
            if criterion == 'steps':
                if step % args.checkpoint_frequency == 0:
                    save_agent_and_replay_buffer(
                        agent, step, args.outdir, suffix='_chkpt',
                        logger=checkpoint_logger,
                        chckptfrq=args.checkpoint_frequency)
            else:
                # TODO seems to checkpoint given wall_time we would have to
                # modify the environment such that it tracks time or number
                # of episodes
                raise NotImplementedError

        hooks = [checkpoint]
        experiments.train_agent(
            agent=agent, env=env, steps=args.steps,
            outdir=args.outdir, step_hooks=hooks, step_offset=t_offset)
def main():
    """Train or evaluate a noisy distributional dueling DQN on Atari.

    The agent is a ``CategoricalDoubleDQN`` with a prioritized multi-step
    replay buffer.  With ``--demo`` the agent is only evaluated.  Otherwise
    it is trained with periodic evaluation, after which the best snapshot
    is reloaded, evaluated for ``--n-best-episodes`` episodes and the
    statistics are written to ``bestscores.json`` in the output directory.
    """
    cli = argparse.ArgumentParser()
    add = cli.add_argument
    add('--env', type=str, default='BreakoutNoFrameskip-v4')
    add('--outdir', type=str, default='results',
        help='Directory path to save output files.'
             ' If it does not exist, it will be created.')
    add('--seed', type=int, default=0,
        help='Random seed [0, 2 ** 31)')
    add('--gpu', type=int, default=0)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default=None)
    add('--use-sdl', action='store_true', default=False)
    add('--eval-epsilon', type=float, default=0.0)
    add('--noisy-net-sigma', type=float, default=0.5)
    add('--steps', type=int, default=5 * 10 ** 7)
    add('--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    add('--replay-start-size', type=int, default=2 * 10 ** 4)
    add('--eval-n-steps', type=int, default=125000)
    add('--eval-interval', type=int, default=250000)
    add('--logging-level', type=int, default=20,
        help='Logging level. 10:DEBUG, 20:INFO etc.')
    add('--render', action='store_true', default=False,
        help='Render env states in a GUI window.')
    add('--monitor', action='store_true', default=False,
        help='Monitor env. Videos and additional information'
             ' are saved as output files.')
    add('--n-best-episodes', type=int, default=200)
    args = cli.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Seed the random number generators used by ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Training and evaluation environments receive distinct seeds.
    seed_for_training = args.seed
    seed_for_testing = 2 ** 31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        """Build a wrapped Atari env; ``test`` selects evaluation settings."""
        training = not test
        atari = atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames)
        wrapped = atari_wrappers.wrap_deepmind(
            atari, episode_life=training, clip_rewards=training)
        wrapped.seed(int(seed_for_testing if test else seed_for_training))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            wrapped = chainerrl.wrappers.RandomizeAction(
                wrapped, args.eval_epsilon)
        if args.monitor:
            monitor_mode = 'evaluation' if test else 'training'
            wrapped = chainerrl.wrappers.Monitor(
                wrapped, args.outdir, mode=monitor_mode)
        if args.render:
            wrapped = chainerrl.wrappers.Render(wrapped)
        return wrapped

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    # Support of the categorical return distribution.
    n_atoms = 51
    v_max = 10
    v_min = -10
    q_function = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)

    # Noisy nets take over exploration, so the explorer acts greedily.
    links.to_factorized_noisy(q_function, sigma_scale=args.noisy_net_sigma)
    explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    dummy_batch = np.zeros((4, 84, 84), dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_function(dummy_batch)],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    optimizer = chainer.optimizers.Adam(6.25e-5, eps=1.5 * 10 ** -4)
    optimizer.setup(q_function)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    beta_steps = args.steps / update_interval
    replay = replay_buffer.PrioritizedReplayBuffer(
        10 ** 6,
        alpha=0.5,
        beta0=0.4,
        betasteps=beta_steps,
        num_steps=3,
        normalize_by_max='memory',
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.CategoricalDoubleDQN(
        q_function, optimizer, replay,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        minibatch_size=32,
        replay_start_size=args.replay_start_size,
        target_update_interval=32000,
        update_interval=update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None)
        summary = 'n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'],
            eval_stats['mean'],
            eval_stats['median'],
            eval_stats['stdev'])
        print(summary)
        return

    experiments.train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=args.steps,
        eval_n_steps=args.eval_n_steps,
        eval_n_episodes=None,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        save_best_so_far_agent=True,
        eval_env=eval_env,
    )

    # Reload the best snapshot saved during training and evaluate it for
    # n_best_episodes episodes, each capped at max_frames / 4 steps.
    agent.load(os.path.join(args.outdir, "best"))
    stats = experiments.evaluator.eval_performance(
        env=eval_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.n_best_episodes,
        max_episode_len=args.max_frames/4,
        logger=None)

    scores_path = os.path.join(args.outdir, 'bestscores.json')
    with open(scores_path, 'w') as f:
        # temporary hack to handle python 2/3 support issues.
        # json dumps does not support non-string literal dict keys
        print(str(json.dumps(stats)), file=f)

    print("The results of the best scoring network:")
    for stat_name in stats:
        print(str(stat_name) + ":" + str(stats[stat_name]))
def main(args):
    """Run the IQN Atari pipeline in 'train', 'check' or 'growth' mode.

    Args:
        args: Parsed options object, or a ``list`` of command-line tokens
            which is first converted with ``make_args``.

    Returns:
        The result of ``tools.make_video.check`` in 'check' mode, the result
        of ``tools.make_video.growth`` in 'growth' mode, ``None`` otherwise.
    """
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if type(args) is list:
        args = make_args(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Seed the random number generators used by ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Training and evaluation environments receive distinct seeds.
    seed_for_training = args.seed
    seed_for_testing = 2 ** 31 - 1 - args.seed

    def make_env(test):
        """Build a wrapped Atari env; ``test`` selects evaluation settings."""
        training = not test
        atari = atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames)
        wrapped = atari_wrappers.wrap_deepmind(
            atari, episode_life=training, clip_rewards=training)
        wrapped.seed(int(seed_for_testing if test else seed_for_training))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            wrapped = chainerrl.wrappers.RandomizeAction(
                wrapped, args.eval_epsilon)
        if args.monitor:
            monitor_mode = 'evaluation' if test else 'training'
            wrapped = gym.wrappers.Monitor(
                wrapped, args.outdir, mode=monitor_mode)
        if args.render:
            wrapped = chainerrl.wrappers.Render(wrapped)
        return wrapped

    env = make_env(test=False)
    eval_env = make_env(test=True)
    n_actions = env.action_space.n

    # The three sub-networks are built in the same order in which the
    # keyword arguments were originally evaluated (psi, phi, f).
    psi_net = chainerrl.links.Sequence(
        L.Convolution2D(None, 32, 8, stride=4),
        F.relu,
        L.Convolution2D(None, 64, 4, stride=2),
        F.relu,
        L.Convolution2D(None, 64, 3, stride=1),
        F.relu,
        functools.partial(F.reshape, shape=(-1, 3136)),
    )
    phi_net = chainerrl.links.Sequence(
        chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
        F.relu,
    )
    f_net = chainerrl.links.Sequence(
        L.Linear(None, 512),
        F.relu,
        L.Linear(None, n_actions),
    )
    q_function = chainerrl.agents.iqn.ImplicitQuantileQFunction(
        psi=psi_net, phi=phi_net, f=f_net)

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros((4, 84, 84), dtype=np.float32)[None]
    fake_taus = np.zeros(32, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_function(fake_obss)(fake_taus)],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1710.10044
    optimizer = chainer.optimizers.Adam(5e-5, eps=1e-2 / args.batch_size)
    optimizer.setup(q_function)

    if args.prioritized:
        beta_steps = args.steps / args.update_interval
        replay = replay_buffer.PrioritizedReplayBuffer(
            10 ** 6, alpha=0.5, beta0=0.4, betasteps=beta_steps,
            num_steps=args.num_step_return)
    else:
        replay = replay_buffer.ReplayBuffer(
            10 ** 6, num_steps=args.num_step_return)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent_cls = parse_agent(args.agent)
    agent = agent_cls(
        q_function, optimizer, replay,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator=args.batch_accumulator,
        phi=phi,
        quantile_thresholds_N=args.quantile_thresholds_N,
        quantile_thresholds_N_prime=args.quantile_thresholds_N_prime,
        quantile_thresholds_K=args.quantile_thresholds_K,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    # The video modes hand straight off to the video tools.
    if args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, save_mp4=args.save_mp4)
    if args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_frames, save_mp4=args.save_mp4)
    if args.mode != 'train':
        # Any other mode does nothing.
        return None

    experiments.train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=args.steps,
        checkpoint_freq=args.checkpoint_frequency,
        step_offset=args.step_offset,
        eval_n_steps=args.eval_n_steps,
        eval_n_episodes=None,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        save_best_so_far_agent=True,
        eval_env=eval_env,
        log_type=args.log_type
    )

    # Reload the best snapshot saved during training and evaluate it for
    # n_best_episodes episodes, each capped at max_frames / 4 steps.
    agent.load(os.path.join(args.outdir, "best"))
    stats = experiments.evaluator.eval_performance(
        env=eval_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.n_best_episodes,
        max_episode_len=args.max_frames / 4,
        logger=None)

    scores_path = os.path.join(args.outdir, 'bestscores.json')
    with open(scores_path, 'w') as f:
        # temporary hack to handle python 2/3 support issues.
        # json dumps does not support non-string literal dict keys
        print(str(json.dumps(stats)), file=f)

    print("The results of the best scoring network:")
    for stat_name in stats:
        print(str(stat_name) + ":" + str(stats[stat_name]))
    return None
def main():
    """Train or evaluate a DoubleDQN grasping agent on batched Kuka envs.

    Several ``KukaDiverseObjectEnv`` instances run in subprocesses, each
    with its own seed.  With ``--demo`` the agent is only evaluated;
    otherwise it is trained with periodic evaluation.
    """
    cli = argparse.ArgumentParser()
    add = cli.add_argument
    add('--outdir', type=str, default='results',
        help='Directory path to save output files.'
             ' If it does not exist, it will be created.')
    add('--seed', type=int, default=0,
        help='Random seed [0, 2 ** 31)')
    add('--gpu', type=int, default=0,
        help='GPU to use, set to -1 if no GPU.')
    add('--demo', action='store_true', default=False,
        help='Evaluate the agent without training.')
    add('--load', type=str, default=None,
        help='Load a saved agent from a given directory.')
    add('--final-exploration-steps', type=int, default=5 * 10 ** 5,
        help='Timesteps after which we stop'
             ' annealing exploration rate')
    add('--final-epsilon', type=float, default=0.2,
        help='Final value of epsilon during training.')
    add('--steps', type=int, default=2 * 10 ** 6,
        help='Total number of timesteps to train the agent.')
    add('--replay-start-size', type=int, default=5 * 10 ** 4,
        help='Minimum replay buffer size before'
             ' performing gradient updates.')
    add('--target-update-interval', type=int, default=1 * 10 ** 4,
        help='Frequency (in timesteps) at which'
             ' the target network is updated.')
    add('--eval-interval', type=int, default=10 ** 5,
        help='Frequency (in timesteps) of evaluation phase.')
    add('--update-interval', type=int, default=1,
        help='Frequency (in timesteps) of network updates.')
    add('--eval-n-runs', type=int, default=100,
        help='Number of episodes used for evaluation.')
    add('--logging-level', type=int, default=20,
        help='Logging level. '
             '10:DEBUG, 20:INFO etc.')
    add('--render', action='store_true', default=False,
        help='Render env states in a GUI window.')
    add('--lr', type=float, default=6.25e-5,
        help='Learning rate')
    add('--num-envs', type=int, default=1,
        help='Number of envs run in parallel.')
    add('--batch-size', type=int, default=32,
        help='Batch size used for training.')
    add('--record', action='store_true', default=False,
        help='Record videos of evaluation envs.'
             ' --render should also be specified.')
    add('--gamma', type=float, default=0.99,
        help='Discount factor.')
    args = cli.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Seed the random number generators used by ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    max_episode_steps = 8

    def make_env(idx, test):
        """Build the ``idx``-th env; ``test`` selects evaluation settings."""
        from pybullet_envs.bullet.kuka_diverse_object_gym_env import KukaDiverseObjectEnv  # NOQA

        # Use different random seeds for train and test envs
        base_seed = int(process_seeds[idx])
        if test:
            env_seed = 2 ** 32 - 1 - base_seed
        else:
            env_seed = base_seed
        # Set a random seed for this subprocess
        misc.set_random_seed(env_seed)

        kuka = KukaDiverseObjectEnv(
            isDiscrete=True,
            renders=args.render and (args.demo or not test),
            height=84,
            width=84,
            maxSteps=max_episode_steps,
            isTest=test,
        )
        # (84, 84, 3) -> (3, 84, 84)
        kuka = TransposeObservation(kuka, (2, 0, 1))
        kuka = ObserveElapsedSteps(kuka, max_episode_steps)
        # KukaDiverseObjectEnv internally asserts int actions and does not
        # accept python-future's newint.
        kuka = CastAction(kuka, __builtins__.int)
        kuka.seed(int(env_seed))

        if test and args.record:
            assert args.render,\
                'To use --record, --render needs be specified.'
            video_dir = os.path.join(args.outdir, 'video_{}'.format(idx))
            os.mkdir(video_dir)
            kuka = RecordMovie(kuka, video_dir)
        return kuka

    def make_batch_env(test):
        """Create one subprocess env per ``--num-envs`` slot."""
        env_factories = [
            functools.partial(make_env, env_index, test)
            for env_index in range(args.num_envs)
        ]
        return chainerrl.envs.MultiprocessVectorEnv(env_factories)

    eval_env = make_batch_env(test=True)
    n_actions = eval_env.action_space.n

    q_function = GraspingQFunction(n_actions, max_episode_steps)

    # Draw the computational graph and save it in the output directory.
    fake_image = np.zeros((3, 84, 84), dtype=np.float32)[None]
    fake_elapsed = np.zeros((), dtype=np.int32)[None]
    fake_obs = (fake_image, fake_elapsed)
    chainerrl.misc.draw_computational_graph(
        [q_function(fake_obs)],
        os.path.join(args.outdir, 'model'))

    # Use the hyper parameters of the Nature paper
    optimizer = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    optimizer.setup(q_function)

    # Anneal beta from beta0 to 1 throughout training
    beta_steps = args.steps / args.update_interval
    replay = replay_buffer.PrioritizedReplayBuffer(
        10 ** 6, alpha=0.6, beta0=0.4, betasteps=beta_steps)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_steps,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        image, elapsed_steps = x
        # Normalize RGB values: [0, 255] -> [0, 1]
        return np.asarray(image, dtype=np.float32) / 255, elapsed_steps

    agent = chainerrl.agents.DoubleDQN(
        q_function, optimizer, replay,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        minibatch_size=args.batch_size,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs)
        summary = 'n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev'])
        print(summary)
        return

    experiments.train_agent_batch_with_evaluation(
        agent=agent,
        env=make_batch_env(test=False),
        eval_env=eval_env,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        save_best_so_far_agent=False,
        log_interval=1000,
    )