def test_save_and_load(self): capacity = self.capacity num_steps = self.num_steps tempdir = tempfile.mkdtemp() rbuf = replay_buffers.ReplayBuffer(capacity, num_steps) correct_item = collections.deque([], maxlen=num_steps) # Add two transitions for _ in range(num_steps): trans1 = dict( state=0, action=1, reward=2, next_state=3, next_action=4, is_state_terminal=False, ) correct_item.append(trans1) rbuf.append(**trans1) correct_item2 = copy.deepcopy(correct_item) trans2 = dict( state=1, action=1, reward=2, next_state=3, next_action=4, is_state_terminal=False, ) correct_item2.append(trans2) rbuf.append(**trans2) # Now it has two transitions assert len(rbuf) == 2 # Save filename = os.path.join(tempdir, "rbuf.pkl") rbuf.save(filename) # Initialize rbuf rbuf = replay_buffers.ReplayBuffer(capacity) # Of course it has no transition yet assert len(rbuf) == 0 # Load the previously saved buffer rbuf.load(filename) # Now it has two transitions again assert len(rbuf) == 2 # And sampled transitions are exactly what I added! s2 = rbuf.sample(2) if s2[0][num_steps - 1]["state"] == 0: assert s2[0] == list(correct_item) assert s2[1] == list(correct_item2) else: assert s2[0] == list(correct_item2) assert s2[1] == list(correct_item)
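# A minimal sketch (separate from the test above) of the save/load round trip
# it exercises, assuming only the pfrl replay-buffer API already used there:
# append(), save(), load(), len() and sample().
import os
import tempfile

from pfrl import replay_buffers

rbuf = replay_buffers.ReplayBuffer(capacity=100)
rbuf.append(
    state=0,
    action=1,
    reward=2,
    next_state=3,
    next_action=4,
    is_state_terminal=False,
)
path = os.path.join(tempfile.mkdtemp(), "rbuf.pkl")
rbuf.save(path)

restored = replay_buffers.ReplayBuffer(capacity=100)
restored.load(path)
assert len(restored) == 1
assert restored.sample(1)[0][0]["state"] == 0  # items are lists of transition dicts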
def test(self): n = 5 if self.replay_buffer_type == "ReplayBuffer": rbuf = replay_buffers.ReplayBuffer(capacity=None, num_steps=n) elif self.replay_buffer_type == "PrioritizedReplayBuffer": rbuf = replay_buffers.PrioritizedReplayBuffer(capacity=None, num_steps=n) else: assert False # 2 transitions for env_id=0 for _ in range(2): trans1 = dict( state=0, action=1, reward=2, next_state=3, next_action=4, is_state_terminal=False, ) rbuf.append(env_id=0, **trans1) # 4 transitions for env_id=1 with a terminal state for i in range(4): trans1 = dict( state=0, action=1, reward=2, next_state=3, next_action=4, is_state_terminal=(i == 3), ) rbuf.append(env_id=1, **trans1) # 9 transitions for env_id=2 for _ in range(9): trans1 = dict( state=0, action=1, reward=2, next_state=3, next_action=4, is_state_terminal=False, ) rbuf.append(env_id=2, **trans1) # It should have: # - 4 transitions from env_id=1 # - 5 transitions from env_id=2 assert len(rbuf) == 9 # env_id=0 episode ends rbuf.stop_current_episode(env_id=0) # Now it should have 9 + 2 = 11 transitions assert len(rbuf) == 11 # env_id=2 episode ends rbuf.stop_current_episode(env_id=2) # Finally it should have 9 + 2 + 4 = 15 transitions assert len(rbuf) == 15
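# Sketch of the per-env buffering behaviour the test above relies on (assumes
# pfrl is installed; behaviour inferred from that test): with num_steps > 1,
# transitions appended under different env_id values live in separate episodic
# queues and only become sampleable once n steps have accumulated, a terminal
# state is reached, or stop_current_episode(env_id=...) is called.
from pfrl import replay_buffers

rbuf = replay_buffers.ReplayBuffer(capacity=None, num_steps=3)
for env_id in (0, 1):
    rbuf.append(
        env_id=env_id,
        state=0,
        action=0,
        reward=1.0,
        next_state=1,
        next_action=0,
        is_state_terminal=False,
    )
assert len(rbuf) == 0  # neither env has accumulated 3 steps yet
rbuf.stop_current_episode(env_id=0)
assert len(rbuf) == 1  # only env 0's partial sequence is flushed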
def _test_load_rainbow(self, gpu): from pfrl.q_functions import DistributionalDuelingDQN q_func = DistributionalDuelingDQN(4, 51, -10, 10) pnn.to_factorized_noisy(q_func, sigma_scale=0.5) explorer = explorers.Greedy() opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4) rbuf = replay_buffers.ReplayBuffer(100) agent = agents.CategoricalDoubleDQN( q_func, opt, rbuf, gpu=gpu, gamma=0.99, explorer=explorer, minibatch_size=32, replay_start_size=50, target_update_interval=32000, update_interval=4, batch_accumulator="mean", phi=lambda x: x, ) downloaded_model, exists = download_model( "Rainbow", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type) agent.load(downloaded_model) if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"): assert exists
def _test_load_ddpg(self, gpu): obs_size = 11 action_size = 3 from pfrl.nn import ConcatObsAndAction q_func = nn.Sequential( ConcatObsAndAction(), nn.Linear(obs_size + action_size, 400), nn.ReLU(), nn.Linear(400, 300), nn.ReLU(), nn.Linear(300, 1), ) from pfrl.nn import BoundByTanh from pfrl.policies import DeterministicHead policy = nn.Sequential( nn.Linear(obs_size, 400), nn.ReLU(), nn.Linear(400, 300), nn.ReLU(), nn.Linear(300, action_size), BoundByTanh(low=[-1.0, -1.0, -1.0], high=[1.0, 1.0, 1.0]), DeterministicHead(), ) opt_a = torch.optim.Adam(policy.parameters()) opt_c = torch.optim.Adam(q_func.parameters()) explorer = explorers.AdditiveGaussian(scale=0.1, low=[-1.0, -1.0, -1.0], high=[1.0, 1.0, 1.0]) agent = agents.DDPG( policy, q_func, opt_a, opt_c, replay_buffers.ReplayBuffer(100), gamma=0.99, explorer=explorer, replay_start_size=1000, target_update_method="soft", target_update_interval=1, update_interval=1, soft_update_tau=5e-3, n_times_update=1, gpu=gpu, minibatch_size=100, burnin_action_func=None, ) downloaded_model, exists = download_model( "DDPG", "Hopper-v2", model_type=self.pretrained_type) agent.load(downloaded_model) if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"): assert exists
def _test_load_td3(self, gpu): obs_size = 11 action_size = 3 def make_q_func_with_optimizer(): q_func = nn.Sequential( pnn.ConcatObsAndAction(), nn.Linear(obs_size + action_size, 400), nn.ReLU(), nn.Linear(400, 300), nn.ReLU(), nn.Linear(300, 1), ) q_func_optimizer = torch.optim.Adam(q_func.parameters()) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() policy = nn.Sequential( nn.Linear(obs_size, 400), nn.ReLU(), nn.Linear(400, 300), nn.ReLU(), nn.Linear(300, action_size), nn.Tanh(), pfrl.policies.DeterministicHead(), ) policy_optimizer = torch.optim.Adam(policy.parameters()) rbuf = replay_buffers.ReplayBuffer(100) explorer = explorers.AdditiveGaussian(scale=0.1, low=[-1.0, -1.0, -1.0], high=[1.0, 1.0, 1.0]) agent = agents.TD3( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=0.99, soft_update_tau=5e-3, explorer=explorer, replay_start_size=1000, gpu=gpu, minibatch_size=100, burnin_action_func=None, ) downloaded_model, exists = download_model( "TD3", "Hopper-v2", model_type=self.pretrained_type) agent.load(downloaded_model) if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"): assert exists
def _test_load_iqn(self, gpu): n_actions = 4 q_func = pfrl.agents.iqn.ImplicitQuantileQFunction( psi=nn.Sequential( nn.Conv2d(4, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(), nn.Conv2d(64, 64, 3, stride=1), nn.ReLU(), nn.Flatten(), ), phi=nn.Sequential( pfrl.agents.iqn.CosineBasisLinear(64, 3136), nn.ReLU(), ), f=nn.Sequential( nn.Linear(3136, 512), nn.ReLU(), nn.Linear(512, n_actions), ), ) # Use the same hyper parameters as https://arxiv.org/abs/1710.10044 opt = torch.optim.Adam(q_func.parameters(), lr=5e-5, eps=1e-2 / 32) rbuf = replay_buffers.ReplayBuffer(100) explorer = explorers.LinearDecayEpsilonGreedy( start_epsilon=1.0, end_epsilon=0.1, decay_steps=10**6, random_action_func=lambda: np.random.randint(4), ) agent = agents.IQN( q_func, opt, rbuf, gpu=gpu, gamma=0.99, explorer=explorer, replay_start_size=50, target_update_interval=10**4, update_interval=4, batch_accumulator="mean", phi=lambda x: x, quantile_thresholds_N=64, quantile_thresholds_N_prime=64, quantile_thresholds_K=32, ) downloaded_model, exists = download_model( "IQN", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type) agent.load(downloaded_model) if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"): assert exists
def test_append_and_terminate(self):
    capacity = self.capacity
    num_steps = self.num_steps
    rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
    assert len(rbuf) == 0
    # Add one item (num_steps transitions) and sample it
    for _ in range(num_steps):
        trans1 = dict(
            state=0,
            action=1,
            reward=2,
            next_state=3,
            next_action=4,
            is_state_terminal=False,
        )
        rbuf.append(**trans1)
    assert len(rbuf) == 1
    s1 = rbuf.sample(1)
    assert len(s1) == 1
    # Append one terminal transition, then sample every stored item;
    # the sampled items must be unique
    trans2 = dict(
        state=1,
        action=1,
        reward=2,
        next_state=3,
        next_action=4,
        is_state_terminal=True,
    )
    rbuf.append(**trans2)
    assert len(rbuf) == self.num_steps + 1
    s2 = rbuf.sample(self.num_steps + 1)
    assert len(s2) == self.num_steps + 1
    if self.num_steps == 1:
        if s2[0][0]["state"] == 0:
            assert s2[1][0]["state"] == 1
        else:
            assert s2[1][0]["state"] == 0
    else:
        for item in s2:
            # e.g. if states are 0,0,0,1 then buffer looks like:
            # [[0,0,0], [0, 0, 1], [0, 1], [1]]
            if len(item) < self.num_steps:
                assert item[len(item) - 1]["state"] == 1
                for i in range(len(item) - 1):
                    assert item[i]["state"] == 0
            else:
                for i in range(len(item) - 1):
                    assert item[i]["state"] == 0
def _test_load_dqn(self, gpu): from pfrl.q_functions import DiscreteActionValueHead n_actions = 4 q_func = nn.Sequential( pnn.LargeAtariCNN(), init_chainer_default(nn.Linear(512, n_actions)), DiscreteActionValueHead(), ) # Use the same hyperparameters as the Nature paper opt = pfrl.optimizers.RMSpropEpsInsideSqrt( q_func.parameters(), lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2, centered=True, ) rbuf = replay_buffers.ReplayBuffer(100) explorer = explorers.LinearDecayEpsilonGreedy( start_epsilon=1.0, end_epsilon=0.1, decay_steps=10**6, random_action_func=lambda: np.random.randint(4), ) agent = agents.DQN( q_func, opt, rbuf, gpu=gpu, gamma=0.99, explorer=explorer, replay_start_size=50, target_update_interval=10**4, clip_delta=True, update_interval=4, batch_accumulator="sum", phi=lambda x: x, ) downloaded_model, exists = download_model( "DQN", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type) agent.load(downloaded_model) if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"): assert exists
def test_append_and_sample(self): capacity = self.capacity num_steps = self.num_steps rbuf = replay_buffers.ReplayBuffer(capacity, num_steps) assert len(rbuf) == 0 # Add one and sample one correct_item = collections.deque([], maxlen=num_steps) for _ in range(num_steps): trans1 = dict( state=0, action=1, reward=2, next_state=3, next_action=4, is_state_terminal=False, ) correct_item.append(trans1) rbuf.append(**trans1) assert len(rbuf) == 1 s1 = rbuf.sample(1) assert len(s1) == 1 assert s1[0] == list(correct_item) # Add two and sample two, which must be unique correct_item2 = copy.deepcopy(correct_item) trans2 = dict( state=1, action=1, reward=2, next_state=3, next_action=4, is_state_terminal=False, ) correct_item2.append(trans2) rbuf.append(**trans2) assert len(rbuf) == 2 s2 = rbuf.sample(2) assert len(s2) == 2 if s2[0][num_steps - 1]["state"] == 0: assert s2[0] == list(correct_item) assert s2[1] == list(correct_item2) else: assert s2[1] == list(correct_item) assert s2[0] == list(correct_item2)
def test_stop_current_episode(self):
    capacity = self.capacity
    num_steps = self.num_steps
    rbuf = replay_buffers.ReplayBuffer(capacity, num_steps)
    assert len(rbuf) == 0
    # Add num_steps - 1 transitions, one short of a complete n-step item
    for _ in range(num_steps - 1):
        trans1 = dict(
            state=0,
            action=1,
            reward=2,
            next_state=3,
            next_action=4,
            is_state_terminal=False,
        )
        rbuf.append(**trans1)
    # we haven't experienced n transitions yet
    assert len(rbuf) == 0
    # episode ends
    rbuf.stop_current_episode()
    # the episode has ended, so the n-1 pending transitions should be added
    assert len(rbuf) == self.num_steps - 1
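# Sketch of what an n-step buffer hands back from sample() (pfrl assumed):
# each sampled item is a list of up to num_steps consecutive transition
# dicts, which is what the item[i]["state"] indexing in the tests above
# relies on.
from pfrl import replay_buffers

rbuf = replay_buffers.ReplayBuffer(capacity=None, num_steps=2)
for t in range(2):
    rbuf.append(
        state=t,
        action=0,
        reward=1.0,
        next_state=t + 1,
        next_action=0,
        is_state_terminal=False,
    )
item = rbuf.sample(1)[0]       # one n-step item
assert isinstance(item, list)  # a list of transition dicts
assert len(item) <= 2          # at most num_steps transitions long
assert item[0]["state"] == 0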
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.01) parser.add_argument("--eval-epsilon", type=float, default=0.001) parser.add_argument("--noisy-net-sigma", type=float, default=None) parser.add_argument( "--arch", type=str, default="doubledqn", choices=["nature", "nips", "dueling", "doubledqn"], ) parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) parser.add_argument("--target-update-interval", type=int, default=3 * 10**4) parser.add_argument("--eval-interval", type=int, default=10**5) parser.add_argument("--update-interval", type=int, default=4) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--no-clip-delta", dest="clip_delta", action="store_false") parser.set_defaults(clip_delta=True) parser.add_argument("--agent", type=str, default="DoubleDQN", choices=["DQN", "DoubleDQN", "PAL"]) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate") parser.add_argument( "--prioritized", action="store_true", default=False, help="Use prioritized experience replay.", ) parser.add_argument("--num-envs", type=int, default=1) parser.add_argument("--n-step-return", type=int, default=1) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, frame_stack=False, ) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) env.seed(env_seed) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env def make_batch_env(test): vec_env = pfrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) vec_env = pfrl.wrappers.VectorFrameStack(vec_env, 4) return vec_env sample_env = make_env(0, test=False) n_actions = sample_env.action_space.n q_func = parse_arch(args.arch, n_actions) if args.noisy_net_sigma is not None: pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Use the same hyper parameters as the Nature paper's opt = optim.RMSprop( q_func.parameters(), lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2, centered=True, ) # Select a replay buffer to use if args.prioritized: # Anneal beta from beta0 to 1 throughout training betasteps = args.steps / args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( 10**6, alpha=0.6, beta0=0.4, betasteps=betasteps, num_steps=args.n_step_return, ) else: rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 Agent = parse_agent(args.agent) agent = Agent( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, clip_delta=args.clip_delta, update_interval=args.update_interval, batch_accumulator="sum", phi=phi, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=make_batch_env(test=True), agent=agent, n_steps=None, n_episodes=args.eval_n_runs, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(test=False), eval_env=make_batch_env(test=True), steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=False, log_interval=1000, )
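# Tiny illustration of what the phi feature extractor defined above does
# (standard Atari preprocessing): it turns stacked uint8 frames into float32
# values in [0, 1] before they reach the Q-network.
import numpy as np

frames = np.random.randint(0, 256, size=(4, 84, 84), dtype=np.uint8)
x = np.asarray(frames, dtype=np.float32) / 255
assert x.dtype == np.float32
assert 0.0 <= x.min() and x.max() <= 1.0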
def main(): import logging logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--env", type=str, default="CartPole-v1") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--final-exploration-steps", type=int, default=1000) parser.add_argument("--start-epsilon", type=float, default=1.0) parser.add_argument("--end-epsilon", type=float, default=0.1) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--steps", type=int, default=10**8) parser.add_argument("--prioritized-replay", action="store_true") parser.add_argument("--replay-start-size", type=int, default=50) parser.add_argument("--target-update-interval", type=int, default=100) parser.add_argument("--target-update-method", type=str, default="hard") parser.add_argument("--soft-update-tau", type=float, default=1e-2) parser.add_argument("--update-interval", type=int, default=1) parser.add_argument("--eval-n-runs", type=int, default=100) parser.add_argument("--eval-interval", type=int, default=1000) parser.add_argument("--n-hidden-channels", type=int, default=12) parser.add_argument("--n-hidden-layers", type=int, default=3) parser.add_argument("--gamma", type=float, default=0.95) parser.add_argument("--minibatch-size", type=int, default=None) parser.add_argument("--render-train", action="store_true") parser.add_argument("--render-eval", action="store_true") parser.add_argument("--monitor", action="store_true") parser.add_argument("--reward-scale-factor", type=float, default=1.0) args = parser.parse_args() # Set a random seed used in PFRL utils.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) def make_env(test): env = gym.make(args.env) env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor) if (args.render_eval and test) or (args.render_train and not test): env = pfrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.max_episode_steps obs_size = env.observation_space.low.size action_space = env.action_space n_atoms = 51 v_max = 500 v_min = 0 n_actions = action_space.n q_func = q_functions.DistributionalFCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_atoms, v_min, v_max, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, ) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample, ) opt = torch.optim.Adam(q_func.parameters(), 1e-3) rbuf_capacity = 50000 # 5 * 10 ** 5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) // args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = 
replay_buffers.ReplayBuffer(rbuf_capacity) agent = pfrl.agents.CategoricalDQN( q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) if args.load: agent.load(args.load) eval_env = make_env(test=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, train_max_episode_len=timestep_limit, )
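# Illustration only (not PFRL internals): the (n_atoms, v_min, v_max) values
# passed above define the fixed support of the categorical return
# distribution that CategoricalDQN (C51) learns over.
import torch

n_atoms, v_min, v_max = 51, 0, 500
support = torch.linspace(v_min, v_max, n_atoms)  # 51 evenly spaced return atoms
delta_z = (v_max - v_min) / (n_atoms - 1)        # spacing between atoms (10.0 here)
assert support.shape == (n_atoms,)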
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.1) parser.add_argument("--eval-epsilon", type=float, default=0.05) parser.add_argument("--steps", type=int, default=10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) parser.add_argument("--target-update-interval", type=int, default=10**4) parser.add_argument("--eval-interval", type=int, default=10**5) parser.add_argument("--update-interval", type=int, default=4) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--batch-size", type=int, default=32) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n n_atoms = 51 v_max = 10 v_min = -10 q_func = torch.nn.Sequential( pfrl.nn.LargeAtariCNN(), pfrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction( 512, n_actions, n_atoms, v_min, v_max, n_hidden_channels=0, n_hidden_layers=0, ), ) # Use the same hyper parameters as https://arxiv.org/abs/1707.06887 opt = torch.optim.Adam(q_func.parameters(), 2.5e-4, eps=1e-2 / args.batch_size) rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = pfrl.agents.CategoricalDQN( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, batch_accumulator="mean", phi=phi, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=False, eval_env=eval_env, )
def main(): import logging logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--env", type=str, default="Pendulum-v0") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--final-exploration-steps", type=int, default=10**4) parser.add_argument("--start-epsilon", type=float, default=1.0) parser.add_argument("--end-epsilon", type=float, default=0.1) parser.add_argument("--noisy-net-sigma", type=float, default=None) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--steps", type=int, default=10**5) parser.add_argument("--prioritized-replay", action="store_true") parser.add_argument("--replay-start-size", type=int, default=1000) parser.add_argument("--target-update-interval", type=int, default=10**2) parser.add_argument("--target-update-method", type=str, default="hard") parser.add_argument("--soft-update-tau", type=float, default=1e-2) parser.add_argument("--update-interval", type=int, default=1) parser.add_argument("--eval-n-runs", type=int, default=100) parser.add_argument("--eval-interval", type=int, default=10**4) parser.add_argument("--n-hidden-channels", type=int, default=100) parser.add_argument("--n-hidden-layers", type=int, default=2) parser.add_argument("--gamma", type=float, default=0.99) parser.add_argument("--minibatch-size", type=int, default=None) parser.add_argument("--render-train", action="store_true") parser.add_argument("--render-eval", action="store_true") parser.add_argument("--monitor", action="store_true") parser.add_argument("--reward-scale-factor", type=float, default=1e-3) parser.add_argument( "--actor-learner", action="store_true", help="Enable asynchronous sampling with asynchronous actor(s)", ) # NOQA parser.add_argument( "--num-envs", type=int, default=1, help=("The number of environments for sampling (only effective with" " --actor-learner enabled)"), ) # NOQA args = parser.parse_args() # Set a random seed used in PFRL utils.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) def make_env(idx=0, test=False): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed utils.set_random_seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if isinstance(env.action_space, spaces.Box): utils.env_modifiers.make_action_filtered(env, clip_action_filter) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor) if (args.render_eval and test) or (args.render_train and not test): env = pfrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.max_episode_steps obs_space = env.observation_space obs_size = obs_space.low.size action_space = env.action_space if isinstance(action_space, spaces.Box): action_size = action_space.low.size # Use NAF to apply DQN to continuous action spaces q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space, ) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: n_actions = action_space.n q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, ) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample, ) if args.noisy_net_sigma is not None: pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() opt = optim.Adam(q_func.parameters()) rbuf_capacity = 5 * 10**5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) // args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffers.ReplayBuffer(rbuf_capacity) agent = DQN( q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) if args.load: agent.load(args.load) eval_env = make_env(test=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) elif not args.actor_learner: print( "WARNING: Since https://github.com/pfnet/pfrl/pull/112 we have started" " setting `eval_during_episode=True` in this script, which affects the" " timings of evaluation phases.") experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, 
eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, train_max_episode_len=timestep_limit, eval_during_episode=True, ) else: # Asynchronous actor-learner mode with the requested number of envs. # When we use multiple envs, it is critical to ensure each env # can occupy a CPU core to get the best performance. # Therefore, we need to prevent potential CPU over-provisioning caused by # multi-threading in OpenMP and NumPy. # Disable the multi-threading on OpenMP and NumPy. os.environ["OMP_NUM_THREADS"] = "1" # NOQA ( make_actor, learner, poller, exception_event, ) = agent.setup_actor_learner_training(args.num_envs) poller.start() learner.start() experiments.train_agent_async( processes=args.num_envs, make_agent=make_actor, make_env=make_env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, stop_event=learner.stop_event, exception_event=exception_event, ) poller.stop() learner.stop() poller.join() learner.join()
def _objective_core( # optuna parameters trial, # training parameters env_id, outdir, seed, monitor, gpu, steps, train_max_episode_len, eval_n_episodes, eval_interval, batch_size, # hyperparameters hyperparams, ): # Set a random seed used in PFRL utils.set_random_seed(seed) # Set different random seeds for train and test envs. train_seed = seed test_seed = 2**31 - 1 - seed def make_env(test=False): env = gym.make(env_id) if not isinstance(env.observation_space, gym.spaces.Box): raise ValueError( "Supported only Box observation environments, but given: {}".format( env.observation_space ) ) if len(env.observation_space.shape) != 1: raise ValueError( "Supported only observation spaces with ndim==1, but given: {}".format( env.observation_space.shape ) ) if not isinstance(env.action_space, gym.spaces.Discrete): raise ValueError( "Supported only discrete action environments, but given: {}".format( env.action_space ) ) env_seed = test_seed if test else train_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if monitor: env = pfrl.wrappers.Monitor(env, outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = pfrl.wrappers.ScaleReward(env, hyperparams["reward_scale_factor"]) return env env = make_env(test=False) obs_space = env.observation_space obs_size = obs_space.low.size action_space = env.action_space n_actions = action_space.n # create model & q_function model = MLP( in_size=obs_size, out_size=n_actions, hidden_sizes=hyperparams["hidden_sizes"] ) q_func = q_functions.SingleModelStateQFunctionWithDiscreteAction(model=model) # Use epsilon-greedy for exploration start_epsilon = 1 explorer = explorers.LinearDecayEpsilonGreedy( start_epsilon=start_epsilon, end_epsilon=hyperparams["end_epsilon"], decay_steps=hyperparams["decay_steps"], random_action_func=action_space.sample, ) opt = optim.Adam( q_func.parameters(), lr=hyperparams["lr"], eps=hyperparams["adam_eps"] ) rbuf_capacity = steps rbuf = replay_buffers.ReplayBuffer(rbuf_capacity) agent = DQN( q_func, opt, rbuf, gpu=gpu, gamma=hyperparams["gamma"], explorer=explorer, replay_start_size=hyperparams["replay_start_size"], target_update_interval=hyperparams["target_update_interval"], update_interval=hyperparams["update_interval"], minibatch_size=batch_size, ) eval_env = make_env(test=True) evaluation_hooks = [OptunaPrunerHook(trial=trial)] _, eval_stats_history = experiments.train_agent_with_evaluation( agent=agent, env=env, steps=steps, eval_n_steps=None, eval_n_episodes=eval_n_episodes, eval_interval=eval_interval, outdir=outdir, eval_env=eval_env, train_max_episode_len=train_max_episode_len, evaluation_hooks=evaluation_hooks, ) score = _get_score_from_eval_stats_history(eval_stats_history) return score
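# Hedged sketch of how _objective_core above might be driven by Optuna,
# assuming it is importable from this module. The wrapper, the search ranges,
# and the CartPole-v1 choice below are illustrative assumptions, not part of
# the original script; trial.suggest_* and study.optimize are standard Optuna
# APIs.
import optuna


def objective(trial):
    hyperparams = {
        "reward_scale_factor": trial.suggest_float(
            "reward_scale_factor", 1e-5, 10.0, log=True
        ),
        "hidden_sizes": [trial.suggest_int("n_hidden_channels", 16, 256)] * 2,
        "end_epsilon": trial.suggest_float("end_epsilon", 0.0, 0.3),
        "decay_steps": trial.suggest_int("decay_steps", 10**3, 10**5),
        "lr": trial.suggest_float("lr", 1e-5, 1e-2, log=True),
        "adam_eps": trial.suggest_float("adam_eps", 1e-8, 1e-3, log=True),
        "gamma": trial.suggest_float("gamma", 0.9, 0.999),
        "replay_start_size": trial.suggest_int("replay_start_size", 100, 5000),
        "target_update_interval": trial.suggest_int(
            "target_update_interval", 100, 10**4
        ),
        "update_interval": trial.suggest_int("update_interval", 1, 8),
    }
    return _objective_core(
        trial=trial,
        env_id="CartPole-v1",  # any Box-observation, Discrete-action env
        outdir="results",
        seed=0,
        monitor=False,
        gpu=-1,
        steps=10**5,
        train_max_episode_len=None,
        eval_n_episodes=10,
        eval_interval=10**4,
        batch_size=64,
        hyperparams=hyperparams,
    )


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)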
def make_replay_buffer(self, env): return replay_buffers.ReplayBuffer(10**5, num_steps=3)
def make_replay_buffer(self, env): return replay_buffers.ReplayBuffer(10**5)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument( "--env", type=str, default="Hopper-v2", help="OpenAI Gym MuJoCo env to perform algorithm on.", ) parser.add_argument("--num-envs", type=int, default=1, help="Number of envs run in parallel.") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--load", type=str, default="", help="Directory to load agent from.") parser.add_argument( "--steps", type=int, default=10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=10, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=5000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=10000, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument("--batch-size", type=int, default=256, help="Minibatch size") parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.") parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.") parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor.") parser.add_argument( "--log-interval", type=int, default=1000, help= "Interval in timesteps between outputting log messages during training", ) parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.") parser.add_argument( "--policy-output-scale", type=float, default=1.0, help="Weight initialization scale of policy output.", ) parser.add_argument( "--optimizer", type=str, default="AdaBelief", ) args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 def make_env(process_idx, test): env = gym.make(args.env) # Unwrap TimiLimit wrapper assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) # Normalize action space to [-1, 1]^n env = pfrl.wrappers.NormalizeActionSpace(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if args.render: env = pfrl.wrappers.Render(env) return env def make_batch_env(test): return pfrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) sample_env = make_env(process_idx=0, test=False) timestep_limit = sample_env.spec.max_episode_steps obs_space = sample_env.observation_space action_space = sample_env.action_space print("Observation space:", obs_space) print("Action space:", action_space) obs_size = obs_space.low.size action_size = action_space.low.size if LooseVersion(torch.__version__) < LooseVersion("1.5.0"): raise Exception("This script requires a PyTorch version >= 1.5.0") def squashed_diagonal_gaussian_head(x): assert x.shape[-1] == action_size * 2 mean, log_scale = torch.chunk(x, 2, dim=1) log_scale = torch.clamp(log_scale, -20.0, 2.0) var = torch.exp(log_scale * 2) base_distribution = distributions.Independent( distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1) # cache_size=1 is required for numerical stability return distributions.transformed_distribution.TransformedDistribution( base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]) def make_optimizer(parameters): if args.optimizer == "OfficialAdaBelief": import adabelief_pytorch optim_class = adabelief_pytorch.AdaBelief optim = optim_class(parameters, betas=(0.9, 0.999), eps=1e-12) else: optim_class = getattr( torch_optimizer, args.optimizer, getattr(torch.optim, args.optimizer, None), ) optim = optim_class(parameters) assert optim_class is not None print(str(optim_class), "with default hyperparameters") return optim policy = nn.Sequential( nn.Linear(obs_size, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, action_size * 2), Lambda(squashed_diagonal_gaussian_head), ) torch.nn.init.xavier_uniform_(policy[0].weight) torch.nn.init.xavier_uniform_(policy[2].weight) torch.nn.init.xavier_uniform_(policy[4].weight, gain=args.policy_output_scale) policy_optimizer = make_optimizer(policy.parameters()) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(obs_size + action_size, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 1), ) torch.nn.init.xavier_uniform_(q_func[1].weight) torch.nn.init.xavier_uniform_(q_func[3].weight) torch.nn.init.xavier_uniform_(q_func[5].weight) q_func_optimizer = make_optimizer(q_func.parameters()) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() rbuf = replay_buffers.ReplayBuffer(10**6) def burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) # Hyperparameters in http://arxiv.org/abs/1802.09477 agent = pfrl.agents.SoftActorCritic( policy, 
q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=0.99, replay_start_size=args.replay_start_size, gpu=args.gpu, minibatch_size=args.batch_size, burnin_action_func=burnin_action_func, entropy_target=-action_size, temperature_optimizer_lr=3e-4, ) if len(args.load) > 0 or args.load_pretrained: if args.load_pretrained: raise Exception("Pretrained models are currently unsupported.") # either load or load_pretrained must be false assert not len(args.load) > 0 or not args.load_pretrained if len(args.load) > 0: agent.load(args.load) else: agent.load( utils.download_model("SAC", args.env, model_type=args.pretrained_type)[0]) if args.demo: eval_stats = experiments.eval_performance( env=make_batch_env(test=True), agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(test=False), eval_env=make_batch_env(test=True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, max_episode_len=timestep_limit, )
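# Standalone sketch of the squashed_diagonal_gaussian_head defined above,
# showing the shape contract it relies on: the policy net emits
# action_size * 2 values per state, split into the mean and log-scale of a
# tanh-squashed diagonal Gaussian. The random input below is made up purely
# for illustration.
import torch
from torch import distributions


def squashed_diagonal_gaussian_head(x, action_size):
    assert x.shape[-1] == action_size * 2
    mean, log_scale = torch.chunk(x, 2, dim=1)
    log_scale = torch.clamp(log_scale, -20.0, 2.0)
    var = torch.exp(log_scale * 2)
    base_distribution = distributions.Independent(
        distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
    )
    # cache_size=1 is required for numerical stability
    return distributions.transformed_distribution.TransformedDistribution(
        base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]
    )


action_size = 3
x = torch.randn(5, action_size * 2)    # stand-in for a policy-network output
dist = squashed_diagonal_gaussian_head(x, action_size)
a = dist.rsample()                     # actions squashed into (-1, 1)
assert a.shape == (5, action_size)
assert dist.log_prob(a).shape == (5,)  # Independent(..., 1) sums over action dims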
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument( "--env", type=str, default="RoboschoolAtlasForwardWalk-v1", help="OpenAI Gym env to perform algorithm on.", ) parser.add_argument("--num-envs", type=int, default=4, help="Number of envs run in parallel.") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--load", type=str, default="", help="Directory to load agent from.") parser.add_argument( "--steps", type=int, default=10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=20, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=100000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=10000, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--update-interval", type=int, default=1, help="Interval in timesteps between model updates.", ) parser.add_argument("--batch-size", type=int, default=256, help="Minibatch size") parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.") parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.") parser.add_argument("--monitor", action="store_true", help="Wrap env with Monitor to write videos.") parser.add_argument( "--log-interval", type=int, default=1000, help= "Interval in timesteps between outputting log messages during training", ) parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.") parser.add_argument( "--n-hidden-channels", type=int, default=1024, help="Number of hidden channels of NN models.", ) parser.add_argument("--discount", type=float, default=0.98, help="Discount factor.") parser.add_argument("--n-step-return", type=int, default=3, help="N-step return.") parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.") parser.add_argument("--adam-eps", type=float, default=1e-1, help="Adam eps.") args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 def make_batch_env(test): return pfrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, args, process_seeds[idx], test) for idx, env in enumerate(range(args.num_envs)) ]) sample_env = make_env(args, process_seeds[0], test=False) timestep_limit = sample_env.spec.max_episode_steps obs_space = sample_env.observation_space action_space = sample_env.action_space print("Observation space:", obs_space) print("Action space:", action_space) del sample_env action_size = action_space.low.size def squashed_diagonal_gaussian_head(x): assert x.shape[-1] == action_size * 2 mean, log_scale = torch.chunk(x, 2, dim=1) log_scale = torch.clamp(log_scale, -20.0, 2.0) var = torch.exp(log_scale * 2) base_distribution = distributions.Independent( distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1) # cache_size=1 is required for numerical stability return distributions.transformed_distribution.TransformedDistribution( base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]) policy = nn.Sequential( nn.Linear(obs_space.low.size, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, action_size * 2), Lambda(squashed_diagonal_gaussian_head), ) torch.nn.init.xavier_uniform_(policy[0].weight) torch.nn.init.xavier_uniform_(policy[2].weight) torch.nn.init.xavier_uniform_(policy[4].weight) policy_optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr, eps=args.adam_eps) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(obs_space.low.size + action_size, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, 1), ) torch.nn.init.xavier_uniform_(q_func[1].weight) torch.nn.init.xavier_uniform_(q_func[3].weight) torch.nn.init.xavier_uniform_(q_func[5].weight) q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=args.lr, eps=args.adam_eps) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return) def burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) # Hyperparameters in http://arxiv.org/abs/1802.09477 agent = pfrl.agents.SoftActorCritic( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=args.discount, update_interval=args.update_interval, replay_start_size=args.replay_start_size, gpu=args.gpu, minibatch_size=args.batch_size, burnin_action_func=burnin_action_func, entropy_target=-action_size, temperature_optimizer_lr=args.lr, ) if len(args.load) > 0: agent.load(args.load) if args.demo: eval_env = make_env(args, seed=0, test=True) eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(test=False), eval_env=make_batch_env(test=True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, 
eval_interval=args.eval_interval, log_interval=args.log_interval, max_episode_len=timestep_limit, )
def __init__(self, buffer_size):
    self._idx = 0  # Index at which the next data will be inserted.
    self._size = 0  # Number of stored entries.
    self.buffer_size = buffer_size  # Capacity of the replay buffer.
    self.buf = replay_buffers.ReplayBuffer(capacity=self.buffer_size)
    self.dev = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=( "Directory path to save output files." " If it does not exist, it will be created." ), ) parser.add_argument( "--env", type=str, default="'DClawTurnFixed-v0'", help="OpenAI Gym MuJoCo env to perform algorithm on.", ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument( "--gpu", type=int, default=-1, help="GPU to use, set to -1 if no GPU." ) parser.add_argument( "--load", type=str, default="", help="Directory to load agent from." ) parser.add_argument( "--max-steps", type=int, default=10 ** 6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=10, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=5000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=10000, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument("--batch-size", type=int, default=64, help="Minibatch size") parser.add_argument( "--render", action="store_true", help="Render env states in a GUI window." ) parser.add_argument( "--demo", action="store_true", help="Just run evaluation, not training." ) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument( "--pretrained-type", type=str, default="best", choices=["best", "final"] ) parser.add_argument( "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor." ) parser.add_argument( "--log-level", type=int, default=logging.INFO, help="Level of the root logger." ) parser.add_argument("--gamma", type=float, default=0.9) parser.add_argument("--ddpg-training-steps", type=int, default=int(1e3)) parser.add_argument("--adversary-training-steps", type=int,default=int(1e3)) args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = './results' print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) def make_env(test): env = gym.make('DClawTurnFixed-v0') # Unwrap TimeLimit wrapper assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs env_seed = 2 ** 32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if args.render and not test: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.max_episode_steps obs_space = env.observation_space action_space = env.action_space print("Observation space:", obs_space) print("Action space:", action_space) obs_size = obs_space.low.size action_size = action_space.low.size q_func = nn.Sequential( ConcatObsAndAction(), nn.Linear(obs_size + action_size, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256,256), nn.ReLU(), nn.Linear(256, 1), ) policy = nn.Sequential( nn.Linear(obs_size, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256,256), nn.ReLU(), nn.Linear(256, action_size), BoundByTanh(low=action_space.low, high=action_space.high), DeterministicHead(), ) ddpg_opt_a = torch.optim.Adam(policy.parameters()) ddpg_opt_c = torch.optim.Adam(q_func.parameters()) ddpg_rbuf = replay_buffers.ReplayBuffer(10 ** 6) 
ddpg_explorer = explorers.AdditiveGaussian( scale=0.1, low=action_space.low, high=action_space.high ) def ddpg_burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) # Hyperparameters in http://arxiv.org/abs/1802.09477 ddpg_agent = DDPG( policy, q_func, ddpg_opt_a, ddpg_opt_c, ddpg_rbuf, gamma=args.gamma, explorer=ddpg_explorer, replay_start_size=args.replay_start_size, target_update_method="soft", target_update_interval=1, update_interval=1, soft_update_tau=5e-3, n_times_update=1, gpu=args.gpu, minibatch_size=args.batch_size, burnin_action_func=ddpg_burnin_action_func, ) def adversary_random_func(): return np.random.randint(0,9) # adversary_q = Critic(obs_size, 1, hidden_size=adversary_hidden_size) # adversary_action_space = gym.spaces.discrete.Discrete(9) # adversary_q = q_functions.FCQuadraticStateQFunction( # obs_size, 1, n_hidden_channels = 256, n_hidden_layers = 2,action_space = adversary_action_space # ) adversary_q = nn.Sequential( nn.Linear(obs_size, 256), nn.Linear(256,256), nn.Linear(256,256), nn.Linear(256,1), DiscreteActionValueHead(), ) adversary_optimizer = torch.optim.Adam(adversary_q.parameters(), lr=1e-3) adversary_rbuf_capacity = int(1e6) adversary_rbuf = replay_buffers.ReplayBuffer(adversary_rbuf_capacity) adversary_explorer = explorers.LinearDecayEpsilonGreedy( 1.0, 0.1, 10**4, adversary_random_func ) adversary_agent = DQN( adversary_q, adversary_optimizer, adversary_rbuf, gpu=args.gpu, gamma=args.gamma, explorer=adversary_explorer, replay_start_size=args.replay_start_size, target_update_interval=1, minibatch_size=args.batch_size, target_update_method='soft', soft_update_tau=5e-3 ) logger = logging.getLogger(__name__) eval_env = make_env(test=True) evaluator = Evaluator( agent=ddpg_agent, n_steps=None, n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, max_episode_len=timestep_limit, env=eval_env, step_offset=0, save_best_so_far_agent=True, use_tensorboard=True, logger=logger, ) episode_reward = 0 ddpg_episode_idx = 0 adversary_episode_idx = 0 # o_0, r_0 current_state = env.reset() t = 0 ddpg_t = 0 adversary_t = 0 episode_len = 0 try: while t < args.max_steps: for i in range(args.ddpg_training_steps): t += 1 ddpg_t += 1 ddpg_action = ddpg_agent.act(current_state) adversary_action = adversary_agent.act(current_state) ddpg_action[adversary_action] = 0 next_state, reward, done, info = env.step(ddpg_action) episode_reward += reward episode_len += 1 reset = episode_len == timestep_limit or info.get("needs_reset", False) ddpg_agent.observe(next_state, reward, done, reset) current_state = next_state if done or reset or t == args.max_steps: logger.info( "ddpg phase: outdir:%s step:%s episode:%s R:%s", args.outdir, ddpg_t, ddpg_episode_idx, episode_reward, ) logger.info("statistics:%s", ddpg_agent.get_statistics()) if evaluator is not None: evaluator.evaluate_if_necessary(t=t, episodes=ddpg_episode_idx + 1) if t == args.max_steps: break episode_reward = 0 ddpg_episode_idx += 1 episode_len = 0 current_state = env.reset() episode_reward = 0 episode_len = 0 current_state = env.reset() print("start adversary training ") for i in range(args.adversary_training_steps): t += 1 adversary_t += 1 ddpg_action = ddpg_agent.act(current_state) adversary_action = adversary_agent.act(current_state) ddpg_action[adversary_action] = 0 next_state, reward, done, info = env.step(ddpg_action) reward = -reward episode_len += 1 reset = episode_len == 
timestep_limit or info.get("needs_reset", False) adversary_agent.observe(next_state, reward, done, reset) current_state = next_state if done or reset or t == args.max_steps: if t == args.max_steps: break episode_reward = 0 adversary_episode_idx += 1 episode_len = 0 current_state = env.reset() except (Exception, KeyboardInterrupt): # Save the current model before being killed save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_except") save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_except" ) raise # Save the final model save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_finish") save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_finish" ) # if args.demo: # eval_env.render() # eval_stats = experiments.eval_performance( # env=eval_env, # agent=ddpg_agent, # n_steps=None, # n_episodes=args.eval_n_runbase_envs, # max_episode_len=timestep_limit, # ) # print( # "n_runs: {} mean: {} median: {} stdev {}".format( # args.eval_n_runs, # eval_stats["mean"], # eval_stats["median"], # eval_stats["stdev"], # ) # ) # else: # experiments.train_agent_with_evaluation( # agent=ddpg_agent, # env=env, # steps=args.steps, # eval_env=eval_env, # eval_n_steps=None, # eval_n_episodes=args.eval_n_runs, # eval_interval=args.eval_interval, # outdir=args.outdir, # train_max_episode_len=timestep_limit, # ) print("finish")
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--load", type=str, default=None) parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.01) parser.add_argument("--eval-epsilon", type=float, default=0.001) parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) parser.add_argument("--target-update-interval", type=int, default=10**4) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--update-interval", type=int, default=4) parser.add_argument("--batch-size", type=int, default=32) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--batch-accumulator", type=str, default="mean", choices=["mean", "sum"]) parser.add_argument("--quantile-thresholds-N", type=int, default=64) parser.add_argument("--quantile-thresholds-N-prime", type=int, default=64) parser.add_argument("--quantile-thresholds-K", type=int, default=32) parser.add_argument("--n-best-episodes", type=int, default=200) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n q_func = pfrl.agents.iqn.ImplicitQuantileQFunction( psi=nn.Sequential( nn.Conv2d(4, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(), nn.Conv2d(64, 64, 3, stride=1), nn.ReLU(), nn.Flatten(), ), phi=nn.Sequential( pfrl.agents.iqn.CosineBasisLinear(64, 3136), nn.ReLU(), ), f=nn.Sequential( nn.Linear(3136, 512), nn.ReLU(), nn.Linear(512, n_actions), ), ) # Use the same hyper parameters as https://arxiv.org/abs/1710.10044 opt = torch.optim.Adam(q_func.parameters(), lr=5e-5, eps=1e-2 / args.batch_size) rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = pfrl.agents.IQN( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, batch_accumulator=args.batch_accumulator, phi=phi, quantile_thresholds_N=args.quantile_thresholds_N, quantile_thresholds_N_prime=args.quantile_thresholds_N_prime, quantile_thresholds_K=args.quantile_thresholds_K, ) if args.load or args.load_pretrained: # either load or load_pretrained must be false assert not args.load or not args.load_pretrained if args.load: agent.load(args.load) else: agent.load( utils.download_model("IQN", args.env, model_type=args.pretrained_type)[0]) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None, ) print("n_steps: {} mean: {} median: {} stdev {}".format( args.eval_n_steps, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=True, eval_env=eval_env, ) dir_of_best_network = os.path.join(args.outdir, "best") agent.load(dir_of_best_network) # run 200 evaluation episodes, each capped at 30 mins of play stats = experiments.evaluator.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.n_best_episodes, max_episode_len=args.max_frames / 4, logger=None, ) with open(os.path.join(args.outdir, "bestscores.json"), "w") as f: json.dump(stats, f) print("The results of the best scoring network:") for stat in stats: print(str(stat) + ":" + str(stats[stat]))
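The psi trunk above flattens its final 64-channel, 7x7 convolutional feature map into the 3136-dimensional embedding that CosineBasisLinear(64, 3136) and the f head expect for 84x84 Atari frames. A quick standalone shape check, rebuilding only the convolutional part:

import torch
import torch.nn as nn

# Same conv trunk as the psi network above; confirms the 3136-dim embedding.
psi = nn.Sequential(
    nn.Conv2d(4, 32, 8, stride=4),
    nn.ReLU(),
    nn.Conv2d(32, 64, 4, stride=2),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3, stride=1),
    nn.ReLU(),
    nn.Flatten(),
)
dummy = torch.zeros(1, 4, 84, 84)  # a batch of four stacked 84x84 frames
assert psi(dummy).shape == (1, 3136)  # 64 channels * 7 * 7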
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--env", type=str, default="BreakoutNoFrameskip-v4", help="OpenAI Atari domain to perform algorithm on.", ) parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--load", type=str, default=None) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument( "--steps", type=int, default=5 * 10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--replay-start-size", type=int, default=5 * 10**4, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument("--n-best-episodes", type=int, default=30) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=None), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, 0.05) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n q_func = nn.Sequential( pnn.LargeAtariCNN(), init_chainer_default(nn.Linear(512, n_actions)), DiscreteActionValueHead(), ) # Use the same hyperparameters as the Nature paper opt = pfrl.optimizers.RMSpropEpsInsideSqrt( q_func.parameters(), lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2, centered=True, ) rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( start_epsilon=1.0, end_epsilon=0.1, decay_steps=10**6, random_action_func=lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 Agent = agents.DQN agent = Agent( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=10**4, clip_delta=True, update_interval=4, batch_accumulator="sum", phi=phi, ) if args.load or args.load_pretrained: # either load or load_pretrained must be false assert not args.load or not args.load_pretrained if args.load: agent.load(args.load) else: agent.load( utils.download_model("DQN", args.env, model_type=args.pretrained_type)[0]) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None) print("n_episodes: {} mean: {} median: {} stdev {}".format( eval_stats["episodes"], eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=True, eval_env=eval_env, ) dir_of_best_network = os.path.join(args.outdir, "best") agent.load(dir_of_best_network) # run 30 evaluation episodes, each capped at 5 mins of play stats = experiments.evaluator.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.n_best_episodes, max_episode_len=4500, logger=None, ) with open(os.path.join(args.outdir, "bestscores.json"), "w") as f: json.dump(stats, f) print("The results of the best scoring network:") for stat in stats: print(str(stat) + ":" + str(stats[stat]))
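The explorer above anneals epsilon linearly from 1.0 to 0.1 over the first 10**6 steps and holds it there afterwards. A small sketch of that schedule (the arithmetic implied by the constructor arguments, not PFRL's internal code):

def linear_epsilon(step, start_epsilon=1.0, end_epsilon=0.1, decay_steps=10**6):
    # Linear interpolation from start_epsilon to end_epsilon, then constant.
    if step >= decay_steps:
        return end_epsilon
    return start_epsilon + (end_epsilon - start_epsilon) * step / decay_steps

assert linear_epsilon(0) == 1.0
assert abs(linear_epsilon(500000) - 0.55) < 1e-12
assert linear_epsilon(2 * 10**6) == 0.1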
def train_PFRL_agent(): policy = make_policy().to(dev) q_func1 = Q_Net().to(dev) q_func2 = Q_Net().to(dev) policy_optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4) q_func1_optimizer = torch.optim.Adam(q_func1.parameters(), lr=3e-4) q_func2_optimizer = torch.optim.Adam(q_func2.parameters(), lr=3e-4) gamma = 0.99 gpu = -1 replay_start_size = 5 * 10 ** 3 minibatch_size = 256 max_grad_norm = 0.5 update_interval = 1 replay_buffer = replay_buffers.ReplayBuffer(5 * 10 ** 3) def burn_in_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(-1, 1, size=2).astype(np.float32) print(torch.cuda.is_available()) agent = pfrl.agents.SoftActorCritic(policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, replay_buffer, gamma, gpu, replay_start_size, minibatch_size, update_interval, max_grad_norm, temperature_optimizer_lr=3e-4, burnin_action_func=burn_in_action_func) env = make_env() # env1 = make_batch_env(False, env) # env2 = make_batch_env(True, env) # experiments.train_agent_batch_with_evaluation( # agent=agent, # env=env, # eval_env=env, # outdir="./", # steps=3 * 10 ** 6, # eval_n_steps=None, # eval_n_episodes=2, # eval_interval=2 * 10 ** 3, # log_interval=10, # max_episode_len=None, # ) eval_interval = 2 * 10 ** 1 policy_start_step = 5 * 10 ** 3 state = env.reset() for i in tqdm(range(3*10**6)): if i % eval_interval == 0 and i != 0: with agent.eval_mode(): state = env.reset() r_sum = 0 while True: act = agent.act(state) n_state, rew, done, info = env.step(act) r_sum += rew if done: print("step {}: rew is {}.".format(i, r_sum)) state = env.reset() break act = agent.act(state) print("act {}".format(act)) n_state, rew, done, info = env.step(act) agent.observe(n_state, rew, done, done) if done: state = env.reset()
def train_PFRL_agent(): env = make_env() env = pfrl.wrappers.CastObservationToFloat32(env) env = pfrl.wrappers.NormalizeActionSpace(env) policy = make_policy(env.state_shape, env.action_space.shape).to(dev) q_func1 = Q_Net(env.state_shape, env.action_space.shape).to(dev) q_func2 = Q_Net(env.state_shape, env.action_space.shape).to(dev) policy_optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4) q_func1_optimizer = torch.optim.Adam(q_func1.parameters(), lr=3e-4) q_func2_optimizer = torch.optim.Adam(q_func2.parameters(), lr=3e-4) gamma = 0.99 gpu = -1 replay_start_size = 5 * 10**1 # replay_start_size = 0 minibatch_size = 1 max_grad_norm = 0.5 update_interval = 1 replay_buffer = replay_buffers.ReplayBuffer(10**6) def burn_in_action_func(): """Select random actions until model is updated one or more times.""" print("burn in action func") ans = np.random.uniform(-1, 1, size=2).astype(np.float32) print("ans shape {}, ans type {}".format(ans.shape, type(ans))) return ans print(torch.cuda.is_available()) agent = pfrl.agents.SoftActorCritic(policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, replay_buffer, gamma, gpu, replay_start_size, minibatch_size, update_interval, max_grad_norm, temperature_optimizer_lr=3e-4, burnin_action_func=burn_in_action_func) eval_interval = 2 * 10**1 policy_start_step = 5 * 10**1 # experiments.train_agent_with_evaluation( # agent=agent, # env=env, # steps=3*10**6, # eval_n_steps=100, # eval_n_episodes=None, # eval_interval=1, # outdir="./", # save_best_so_far_agent=True, # eval_env=env, # ) state = env.reset() state_test = state.reshape(1, -1) print("state shape {}, type {}".format(state.shape, type(state))) print("state_test shape {}, type {}".format(state_test.shape, type(state_test))) with agent.eval_mode(): agent.act(state_test) for i in tqdm(range(3 * 10**6)): if i % eval_interval == 0 and i != 0: with agent.eval_mode(): state = env.reset() state = torch.from_numpy(state).to(dev) r_sum = 0 while True: act = agent.act(state) n_state, rew, done, info = env.step(act) r_sum += rew state = torch.from_numpy(n_state).to(dev) if done: print("step {}: rew is {}.".format(i, r_sum)) state = env.reset() break # if i < policy_start_step: # act = env.action_space.sample() # else: act = agent.act(state) print("act {}".format(act)) n_state, rew, done, info = env.step(act) agent.observe(n_state, rew, done, done) if done: state = env.reset()
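Both training functions above drive PFRL manually through act()/observe() instead of experiments.train_agent_with_evaluation. A condensed sketch of that pattern with periodic evaluation, assuming a Gym-style env and any PFRL agent:

def run_with_periodic_eval(agent, env, total_steps, eval_interval):
    # Manual PFRL loop: act/observe during training, eval_mode for rollouts.
    state = env.reset()
    for step in range(1, total_steps + 1):
        action = agent.act(state)
        state, reward, done, info = env.step(action)
        agent.observe(state, reward, done, reset=done)
        if done:
            state = env.reset()
        if step % eval_interval == 0:
            with agent.eval_mode():
                eval_state, episode_return, eval_done = env.reset(), 0.0, False
                while not eval_done:
                    eval_state, r, eval_done, _ = env.step(agent.act(eval_state))
                    episode_return += r
            print("step {}: eval return {}".format(step, episode_return))
            state = env.reset()  # the training episode was interrupted by the rollout

Evaluating on the same env as training interrupts the current training episode, which is why the state is re-reset afterwards, mirroring what the loops above do.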
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument( "--env", type=str, default="Hopper-v2", help="OpenAI Gym MuJoCo env to perform algorithm on.", ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--load", type=str, default="", help="Directory to load agent from.") parser.add_argument( "--steps", type=int, default=10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=10, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=5000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=10000, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument("--batch-size", type=int, default=100, help="Minibatch size") parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.") parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.") parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor.") parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.") args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) def make_env(test): env = gym.make(args.env) # Unwrap TimeLimit wrapper assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if args.render and not test: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.max_episode_steps obs_space = env.observation_space action_space = env.action_space print("Observation space:", obs_space) print("Action space:", action_space) obs_size = obs_space.low.size action_size = action_space.low.size policy = nn.Sequential( nn.Linear(obs_size, 400), nn.ReLU(), nn.Linear(400, 300), nn.ReLU(), nn.Linear(300, action_size), nn.Tanh(), pfrl.policies.DeterministicHead(), ) policy_optimizer = torch.optim.Adam(policy.parameters()) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(obs_size + action_size, 400), nn.ReLU(), nn.Linear(400, 300), nn.ReLU(), nn.Linear(300, 1), ) q_func_optimizer = torch.optim.Adam(q_func.parameters()) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.AdditiveGaussian(scale=0.1, low=action_space.low, high=action_space.high) def 
burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) # Hyperparameters in http://arxiv.org/abs/1802.09477 agent = pfrl.agents.TD3( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=0.99, soft_update_tau=5e-3, explorer=explorer, replay_start_size=args.replay_start_size, gpu=args.gpu, minibatch_size=args.batch_size, burnin_action_func=burnin_action_func, ) if len(args.load) > 0 or args.load_pretrained: # either load or load_pretrained must be false assert not len(args.load) > 0 or not args.load_pretrained if len(args.load) > 0: agent.load(args.load) else: agent.load( utils.download_model("TD3", args.env, model_type=args.pretrained_type)[0]) eval_env = make_env(test=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) import json import os with open(os.path.join(args.outdir, "demo_scores.json"), "w") as f: json.dump(eval_stats, f) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_env=eval_env, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, train_max_episode_len=timestep_limit, )
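The --load/--load-pretrained combination above is guarded by an assert; the same mutual exclusion can also be expressed directly in argparse, shown here only as an alternative sketch, not what the script does:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("--load", type=str, default="", help="Directory to load agent from.")
group.add_argument("--load-pretrained", action="store_true", default=False)
args = parser.parse_args(["--load-pretrained"])
assert args.load == "" and args.load_pretrained  # passing both flags would error out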
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--env", type=str, default="BreakoutNoFrameskip-v4", help="OpenAI Atari domain to perform algorithm on.", ) parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument( "--final-exploration-frames", type=int, default=10**6, help="Timesteps after which we stop " + "annealing exploration rate", ) parser.add_argument( "--final-epsilon", type=float, default=0.01, help="Final value of epsilon during training.", ) parser.add_argument( "--eval-epsilon", type=float, default=0.001, help="Exploration epsilon used during eval episodes.", ) parser.add_argument("--noisy-net-sigma", type=float, default=None) parser.add_argument( "--arch", type=str, default="doubledqn", choices=["nature", "nips", "dueling", "doubledqn"], help="Network architecture to use.", ) parser.add_argument( "--steps", type=int, default=5 * 10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument( "--replay-start-size", type=int, default=5 * 10**4, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--target-update-interval", type=int, default=3 * 10**4, help="Frequency (in timesteps) at which " + "the target network is updated.", ) parser.add_argument( "--eval-interval", type=int, default=10**5, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( "--update-interval", type=int, default=4, help="Frequency (in timesteps) of network updates.", ) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--no-clip-delta", dest="clip_delta", action="store_false") parser.add_argument("--num-step-return", type=int, default=1) parser.set_defaults(clip_delta=True) parser.add_argument("--agent", type=str, default="DoubleDQN", choices=["DQN", "DoubleDQN", "PAL"]) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate.") parser.add_argument( "--prioritized", action="store_true", default=False, help="Use prioritized experience replay.", ) parser.add_argument( "--checkpoint-frequency", type=int, default=None, help="Frequency at which agents are stored.", ) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n q_func = parse_arch(args.arch, n_actions) if args.noisy_net_sigma is not None: pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() else: explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) # Use the Nature paper's hyperparameters opt = pfrl.optimizers.RMSpropEpsInsideSqrt( q_func.parameters(), lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2, centered=True, ) # Select a replay buffer to use if args.prioritized: # Anneal beta from beta0 to 1 throughout training betasteps = args.steps / args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( 10**6, alpha=0.6, beta0=0.4, betasteps=betasteps, num_steps=args.num_step_return, ) else: rbuf = replay_buffers.ReplayBuffer(10**6, args.num_step_return) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 Agent = parse_agent(args.agent) agent = Agent( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, clip_delta=args.clip_delta, update_interval=args.update_interval, batch_accumulator="sum", phi=phi, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, checkpoint_freq=args.checkpoint_frequency, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=False, eval_env=eval_env, )
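When --prioritized is set, betasteps ties the importance-sampling exponent to the number of gradient updates (steps / update_interval), so beta reaches 1 at the end of training. The arithmetic, as a standalone sketch of the schedule implied by beta0=0.4 (not PFRL internals):

# Linear beta schedule implied by beta0=0.4 and betasteps=steps/update_interval.
steps = 5 * 10**7
update_interval = 4
betasteps = steps / update_interval  # 12.5M updates
beta0 = 0.4

def beta_at(update_count):
    return min(1.0, beta0 + (1.0 - beta0) * update_count / betasteps)

assert beta_at(0) == 0.4
assert abs(beta_at(betasteps / 2) - 0.7) < 1e-12
assert beta_at(betasteps) == 1.0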
def main(): if LooseVersion(torch.__version__) < LooseVersion("1.5.0"): raise Exception("This script requires a PyTorch version >= 1.5.0") parser = argparse.ArgumentParser() parser.add_argument('-w', '--weight_dir', type=str, default='', help='path to trained') parser.add_argument('-s', '--step_to_load', type=int, default=0, help='step checkpoint to load') parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)') args = parser.parse_args() weight_dir = args.weight_dir step_to_load = args.step_to_load task_path = os.path.dirname(os.path.realpath(__file__)) rsc_path = task_path + "/../rsc" save_path = os.path.join( weight_dir, 'testing_' + str(step_to_load), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) os.makedirs(save_path) for file in os.listdir(weight_dir): if file.startswith('cfg_sac'): cfg_abs_path = weight_dir + '/' + file # config cfg = YAML().load(open(cfg_abs_path, 'r')) cfg['environment']['num_envs'] = 1 cfg['environment']['num_threads'] = 1 cfg['environment']['control_dt'] = cfg['testing']['control_dt'] cfg['environment']['render'] = cfg['testing']['render'] impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper)) env = VecEnvPython(impl) obs_space = env.observation_space action_space = env.action_space print("Observation space:", obs_space) print("Action space:", action_space) # seeding seed = cfg['environment']['seed'] torch.manual_seed(seed) utils.set_random_seed(seed) # Set a random seed used in PFRL obs_size = obs_space.low.size action_size = action_space.low.size def squashed_diagonal_gaussian_head(x): assert x.shape[-1] == action_size * 2 mean, log_scale = torch.chunk(x, 2, dim=1) log_scale = torch.clamp(log_scale, -20.0, 2.0) var = torch.exp(log_scale * 2) base_distribution = distributions.Independent( distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1) # cache_size=1 is required for numerical stability return distributions.transformed_distribution.TransformedDistribution( base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]) policy = nn.Sequential( nn.Linear(obs_size, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, action_size * 2), Lambda(squashed_diagonal_gaussian_head), ) policy_optimizer = torch.optim.Adam(policy.parameters(), lr=cfg['algorithm']['learning_rate']) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(obs_size + action_size, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 1), ) torch.nn.init.xavier_uniform_(q_func[1].weight) torch.nn.init.xavier_uniform_(q_func[3].weight) torch.nn.init.xavier_uniform_(q_func[5].weight) q_func_optimizer = torch.optim.Adam( q_func.parameters(), lr=cfg['algorithm']['learning_rate']) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size']) def burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) agent = pfrl.agents.SoftActorCritic( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=cfg['algorithm']['discount_factor'], replay_start_size=cfg['algorithm']['replay_start_size'], gpu=args.gpu, minibatch_size=cfg['algorithm']['minibatch_size'], burnin_action_func=burnin_action_func, entropy_target=-action_size, 
temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'], ) agent.load(weight_dir + '/' + str(step_to_load) + '_checkpoint') if cfg['testing']['render']: env.wrapper.showWindow() if cfg['testing']['record_video']: env.start_recording_video(save_path + '/test.mp4') test_steps = int(cfg['testing']['seconds'] / cfg['testing']['control_dt']) torch.manual_seed(cfg['environment']['seed']) act = np.ndarray(shape=(1, env.wrapper.getActionDim()), dtype=np.float32) _, _, _, new_info = env.step(act, visualize=cfg['testing']['render']) ob = env.reset() try: for i in range(test_steps): if i % 100 == 0: env.reset() with agent.eval_mode(): agent.act_deterministically = True act = agent.batch_act(ob) ob, rew, done, info = env.step(act, visualize=cfg['testing']['render']) except KeyboardInterrupt: pass finally: if cfg['testing']['record_video']: env.stop_recording_video()
def _test_load_sac(self, gpu): obs_size = 11 action_size = 3 def squashed_diagonal_gaussian_head(x): assert x.shape[-1] == action_size * 2 mean, log_scale = torch.chunk(x, 2, dim=1) log_scale = torch.clamp(log_scale, -20.0, 2.0) var = torch.exp(log_scale * 2) from torch import distributions base_distribution = distributions.Independent( distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1) # cache_size=1 is required for numerical stability return distributions.transformed_distribution.TransformedDistribution( base_distribution, [distributions.transforms.TanhTransform(cache_size=1)], ) from pfrl.nn.lmbda import Lambda policy = nn.Sequential( nn.Linear(obs_size, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, action_size * 2), Lambda(squashed_diagonal_gaussian_head), ) policy_optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(obs_size + action_size, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 1), ) torch.nn.init.xavier_uniform_(q_func[1].weight) torch.nn.init.xavier_uniform_(q_func[3].weight) torch.nn.init.xavier_uniform_(q_func[5].weight) q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=3e-4) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() agent = agents.SoftActorCritic( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, replay_buffers.ReplayBuffer(100), gamma=0.99, replay_start_size=1000, gpu=gpu, minibatch_size=256, burnin_action_func=None, entropy_target=-3, temperature_optimizer_lr=3e-4, ) downloaded_model, exists = download_model( "SAC", "Hopper-v2", model_type=self.pretrained_type) agent.load(downloaded_model) if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"): assert exists
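The squashed diagonal Gaussian head above appears in several of these scripts; a standalone sanity check that the tanh-transformed distribution yields actions of the right shape and bounded in [-1, 1], using a dummy all-zero network output:

import torch
from torch import distributions

action_size = 3
x = torch.zeros(5, action_size * 2)  # dummy head output: mean=0, log_scale=0
mean, log_scale = torch.chunk(x, 2, dim=1)
log_scale = torch.clamp(log_scale, -20.0, 2.0)
var = torch.exp(log_scale * 2)
base_distribution = distributions.Independent(
    distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
)
dist = distributions.transformed_distribution.TransformedDistribution(
    base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]
)
sample = dist.sample()
assert sample.shape == (5, action_size)
assert torch.all(sample.abs() <= 1.0)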
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--num-envs", type=int, default=1, help="Number of envs run in parallel.") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--load", type=str, default="", help="Directory to load agent from.") parser.add_argument( "--steps", type=int, default=10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=10, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=100000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=2500, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--update-interval", type=int, default=1, help="Interval in timesteps between model updates.", ) parser.add_argument("--batch-size", type=int, default=100, help="Minibatch size") parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.") parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.") parser.add_argument("--monitor", action="store_true", help="Wrap env with Monitor to write videos.") parser.add_argument( "--log-interval", type=int, default=1000, help= "Interval in timesteps between outputting log messages during training", ) parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.") parser.add_argument( "--n-hidden-channels", type=int, default=256, help="Number of hidden channels of NN models.", ) parser.add_argument( "--env", default="AntMaze", help= "Type of Ant Env to use. Options are AntMaze, AntFall, and AntPush.", type=str) parser.add_argument("--discount", type=float, default=0.99, help="Discount factor.") parser.add_argument("--n-step-return", type=int, default=3, help="N-step return.") parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.") parser.add_argument("--adam-eps", type=float, default=1e-1, help="Adam eps.") args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 def make_ant_env(idx, test): # use different seeds for train vs test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed # env_seed = np.random.randint(0, 2**32 - 1) if not test else process_seed print('seed', env_seed) utils.set_random_seed(env_seed) # create the anv environment with goal env = AntEnvWithGoal(create_maze_env(args.env), args.env, env_subgoal_dim=15) env.seed(int(env_seed)) if args.render: env = pfrl.wrappers.GymLikeEnvRender(env, mode='human') return env eval_env = make_ant_env(0, test=True) env_state_dim = eval_env.state_dim env_action_dim = eval_env.action_dim if args.env == 'AntMaze' or args.env == 'AntPush': env_goal_dim = 2 else: env_goal_dim = 3 action_size = env_action_dim action_space = eval_env.action_space scale_low = action_space.high * np.ones(env_action_dim) def squashed_diagonal_gaussian_head(x): assert x.shape[-1] == action_size * 2 mean, log_scale = torch.chunk(x, 2, dim=1) log_scale = torch.clamp(log_scale, -20.0, 2.0) var = torch.exp(log_scale * 2) base_distribution = distributions.Independent( distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1) # cache_size=1 is required for numerical stability return distributions.transformed_distribution.TransformedDistribution( base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]) policy = nn.Sequential( nn.Linear(env_state_dim + env_goal_dim, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, action_size * 2), Lambda(squashed_diagonal_gaussian_head), ) torch.nn.init.xavier_uniform_(policy[0].weight) torch.nn.init.xavier_uniform_(policy[2].weight) torch.nn.init.xavier_uniform_(policy[4].weight) policy_optimizer = torch.optim.Adam(policy.parameters(), lr=0.0001) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(env_state_dim + env_goal_dim + env_action_dim, 300), nn.ReLU(), nn.Linear(300, 300), nn.ReLU(), nn.Linear(300, 1), ) torch.nn.init.xavier_uniform_(q_func[1].weight) torch.nn.init.xavier_uniform_(q_func[3].weight) torch.nn.init.xavier_uniform_(q_func[5].weight) q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=0.001) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() rbuf = replay_buffers.ReplayBuffer(200000) def burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) if args.gpu is not None and args.gpu >= 0: assert torch.cuda.is_available() device = torch.device("cuda:{}".format(args.gpu)) else: device = torch.device("cpu") # Hyperparameters in http://arxiv.org/abs/1802.09477 scale_tensor = torch.tensor(scale_low).float().to(device) agent = pfrl.agents.SoftActorCritic( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=args.discount, update_interval=args.update_interval, replay_start_size=args.replay_start_size, gpu=args.gpu, minibatch_size=args.batch_size, burnin_action_func=burnin_action_func, entropy_target=-action_size, temperature_optimizer_lr=args.lr, scale=scale_tensor) if len(args.load) > 0: agent.load(args.load) if args.demo: eval_env = make_env(args, seed=0, test=True) eval_stats = experiments.eval_performance( 
env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_goal_conditioned_agent_with_evaluation( agent=agent, env=make_ant_env(0, test=False), steps=args.steps, eval_n_steps=None, outdir=args.outdir, eval_n_episodes=args.eval_n_runs, eval_interval=5000, use_tensorboard=True, )
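The per-env seeding at the top of this script follows the comment's arithmetic: with seed s and N envs, the subprocess seeds are s*N through s*N + N - 1, and test envs use 2**32 - 1 - process_seed. A quick check of the train-side expression:

import numpy as np

def subprocess_seeds(seed, num_envs):
    # Same expression as process_seeds above.
    return np.arange(num_envs) + seed * num_envs

assert list(subprocess_seeds(0, 4)) == [0, 1, 2, 3]
assert list(subprocess_seeds(1, 4)) == [4, 5, 6, 7]
assert subprocess_seeds(1, 4).max() < 2**32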