def objective(trial):
    # suggest parameters from Optuna
    hyperparams = suggest(trial, args.steps)
    # seed is generated for each objective
    seed = randomizer.randint(0, 2**31 - 1)
    additional_args = dict(seed=seed, **hyperparams)

    outdir = experiments.prepare_output_dir(args=additional_args,
                                            basedir=rootdir)
    print("Output files are saved in {}".format(outdir))

    return _objective_core(
        # optuna parameters
        trial=trial,
        # training parameters
        env_id=args.env,
        outdir=outdir,
        seed=seed,
        monitor=args.monitor,
        gpu=args.gpu,
        steps=args.steps,
        train_max_episode_len=args.train_max_episode_len,
        eval_n_episodes=args.eval_n_episodes,
        eval_interval=args.eval_interval,
        batch_size=args.batch_size,
        # hyperparameters
        hyperparams=hyperparams,
    )
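# The `suggest` helper used above is assumed to be defined elsewhere in this
# repository. A minimal sketch of what such a helper might look like with
# Optuna's suggest API; the hyperparameter names and ranges below are
# hypothetical, not the ones this script actually searches over.
def suggest_sketch(trial, steps):
    return {
        "lr": trial.suggest_float("lr", 1e-5, 1e-2, log=True),
        "gamma": trial.suggest_float("gamma", 0.9, 0.999),
        "replay_start_size": trial.suggest_int("replay_start_size", 1000,
                                               min(10**4, steps)),
    }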
def main():
    args = parse_rl_args()

    cfg = YAML().load(
        open(os.environ["FLIGHTMARE_PATH"] + "/flightlib/configs/vec_env.yaml",
             'r'))
    if not args.train:
        cfg["env"]["num_envs"] = 1
        cfg["env"]["num_threads"] = 1
    if args.render:
        cfg["env"]["render"] = "yes"
    else:
        cfg["env"]["render"] = "no"

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_drone_env(cfg, idx, test):
        # Use different seeds for train vs test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        # env_seed = np.random.randint(0, 2**32 - 1) if not test else process_seed
        utils.set_random_seed(env_seed)
        # Create the goal-conditioned drone environment
        env = wrapper.FlightEnvVec(
            QuadrotorGoalConditionedEnv_v1(dump(cfg, Dumper=RoundTripDumper),
                                           False))
        env.seed(env_seed)
        if args.render:
            env = pfrl.wrappers.GymLikeEnvRender(env)
        return env

    eval_env = make_drone_env(cfg, 0, test=True)
    env_state_dim = eval_env.state_dim
    env_action_dim = eval_env.action_dim
    env_subgoal_dim = 12
    # env_subgoal_dim = eval_env.subgoal_dim
    env_goal_dim = eval_env.obs_dim
    action_space = eval_env.action_space
    subgoal_space = eval_env.subgoal_space
    scale_low = action_space.high * np.ones(env_action_dim)
    # create subgoal space in env!
    scale_high = subgoal_space.high * np.ones(env_subgoal_dim)
    print(action_space.high, action_space.low)

    def low_level_burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    def high_level_burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(subgoal_space.low,
                                 subgoal_space.high).astype(np.float32)

    gpu = 0 if torch.cuda.is_available() else None

    agent = HIROAgent(
        state_dim=env_state_dim,
        action_dim=env_action_dim,
        goal_dim=env_goal_dim,
        subgoal_dim=env_subgoal_dim,
        high_level_burnin_action_func=high_level_burnin_action_func,
        low_level_burnin_action_func=low_level_burnin_action_func,
        scale_low=scale_low,
        scale_high=scale_high,
        buffer_size=200000,
        subgoal_freq=10,
        train_freq=10,
        reward_scaling=0.1,
        goal_threshold=5,
        gpu=gpu,
        add_entropy=args.add_entropy)
    print(args.add_entropy)

    if args.load:
        # Load weights from a file if the arg is supplied
        agent.load(args.load)

    if args.record:
        from mujoco_py import GlfwContext
        GlfwContext(offscreen=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            video_outdir=args.outdir,
            step_number=-1 if args.record else None  # any non-None value enables recording
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        # Train the hierarchical agent
        experiments.train_hrl_agent_with_evaluation(agent=agent,
                                                    env=make_drone_env(
                                                        cfg, 0, test=False),
                                                    steps=args.steps,
                                                    outdir=args.outdir,
                                                    eval_n_steps=None,
                                                    eval_interval=5000,
                                                    eval_n_episodes=10,
                                                    use_tensorboard=True,
                                                    record=args.record)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--load", type=str, default=None) parser.add_argument("--eval-epsilon", type=float, default=0.0) parser.add_argument("--noisy-net-sigma", type=float, default=0.5) parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--replay-start-size", type=int, default=2 * 10**4) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--n-best-episodes", type=int, default=200) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n n_atoms = 51 v_max = 10 v_min = -10 q_func = DistributionalDuelingDQN( n_actions, n_atoms, v_min, v_max, ) # Noisy nets pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Use the same hyper parameters as https://arxiv.org/abs/1710.02298 opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4) # Prioritized Replay # Anneal beta from beta0 to 1 throughout training update_interval = 4 betasteps = args.steps / update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( 10**6, alpha=0.5, beta0=0.4, betasteps=betasteps, num_steps=3, normalize_by_max="memory", ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 Agent = agents.CategoricalDoubleDQN agent = Agent( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, minibatch_size=32, replay_start_size=args.replay_start_size, target_update_interval=32000, update_interval=update_interval, batch_accumulator="mean", phi=phi, ) if args.load or args.load_pretrained: # either load_ or load_pretrained must be false assert not args.load or not args.load_pretrained if args.load: agent.load(args.load) else: agent.load( utils.download_model("Rainbow", args.env, model_type=args.pretrained_type)[0]) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None) print("n_episodes: {} mean: {} median: {} stdev {}".format( eval_stats["episodes"], eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=True, eval_env=eval_env, ) dir_of_best_network = os.path.join(args.outdir, "best") agent.load(dir_of_best_network) # run 200 evaluation episodes, each capped at 30 mins of play stats = experiments.evaluator.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.n_best_episodes, max_episode_len=args.max_frames / 4, logger=None, ) with open(os.path.join(args.outdir, "bestscores.json"), "w") as f: json.dump(stats, f) print("The results of the best scoring network:") for stat in stats: print(str(stat) + ":" + str(stats[stat]))
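# Context for the distributional (C51) head built above: the return
# distribution is represented over a fixed support of n_atoms values between
# v_min and v_max. A minimal sketch of that support, independent of PFRL's
# internals:
import numpy as np

n_atoms, v_min, v_max = 51, -10, 10
support = np.linspace(v_min, v_max, n_atoms)  # 51 evenly spaced return atoms
delta_z = (v_max - v_min) / (n_atoms - 1)     # atom spacing of 0.4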
def main(): parser = argparse.ArgumentParser() parser.add_argument("--processes", type=int, default=16) parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--t-max", type=int, default=5) parser.add_argument("--beta", type=float, default=1e-2) parser.add_argument("--profile", action="store_true") parser.add_argument("--steps", type=int, default=8 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--lr", type=float, default=7e-4) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--load", type=str, default="") parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env sample_env = make_env(0, False) obs_size = sample_env.observation_space.low.shape[0] n_actions = sample_env.action_space.n model = nn.Sequential( nn.Conv2d(obs_size, 16, 8, stride=4), nn.ReLU(), nn.Conv2d(16, 32, 4, stride=2), nn.ReLU(), nn.Flatten(), nn.Linear(2592, 256), nn.ReLU(), pfrl.nn.Branched( nn.Sequential( nn.Linear(256, n_actions), SoftmaxCategoricalHead(), ), nn.Linear(256, 1), ), ) # SharedRMSprop is same as torch.optim.RMSprop except that it initializes # its state in __init__, allowing it to be moved to shared memory. 
opt = SharedRMSpropEpsInsideSqrt(model.parameters(), lr=7e-4, eps=1e-1, alpha=0.99) assert opt.state_dict()["state"], ( "To share optimizer state across processes, the state must be" " initialized before training.") def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = a3c.A3C( model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=phi, max_grad_norm=40.0, ) if args.load_pretrained: raise Exception("Pretrained models are currently unsupported.") if args.load: agent.load(args.load) if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None) print("n_steps: {} mean: {} median: {} stdev: {}".format( args.eval_n_steps, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): for pg in agent.optimizer.param_groups: assert "lr" in pg pg["lr"] = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=True, )
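# The LinearInterpolationHook above anneals the learning rate from args.lr to
# zero over args.steps environment steps. A simplified stand-in for the value
# it computes at a given global step (not PFRL's implementation):
def linearly_decayed_lr(step, total_steps, start_lr, end_lr=0.0):
    fraction = min(step / total_steps, 1.0)
    return start_lr + fraction * (end_lr - start_lr)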
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--env", type=str, default="BreakoutNoFrameskip-v4", help="OpenAI Atari domain to perform algorithm on.", ) parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument( "--final-exploration-frames", type=int, default=10**6, help="Timesteps after which we stop " + "annealing exploration rate", ) parser.add_argument( "--final-epsilon", type=float, default=0.01, help="Final value of epsilon during training.", ) parser.add_argument( "--eval-epsilon", type=float, default=0.001, help="Exploration epsilon used during eval episodes.", ) parser.add_argument("--noisy-net-sigma", type=float, default=None) parser.add_argument( "--arch", type=str, default="doubledqn", choices=["nature", "nips", "dueling", "doubledqn"], help="Network architecture to use.", ) parser.add_argument( "--steps", type=int, default=5 * 10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument( "--replay-start-size", type=int, default=5 * 10**4, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--target-update-interval", type=int, default=3 * 10**4, help="Frequency (in timesteps) at which " + "the target network is updated.", ) parser.add_argument( "--eval-interval", type=int, default=10**5, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( "--update-interval", type=int, default=4, help="Frequency (in timesteps) of network updates.", ) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--no-clip-delta", dest="clip_delta", action="store_false") parser.add_argument("--num-step-return", type=int, default=1) parser.set_defaults(clip_delta=True) parser.add_argument("--agent", type=str, default="DoubleDQN", choices=["DQN", "DoubleDQN", "PAL"]) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate.") parser.add_argument( "--prioritized", action="store_true", default=False, help="Use prioritized experience replay.", ) parser.add_argument( "--checkpoint-frequency", type=int, default=None, help="Frequency at which agents are stored.", ) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n q_func = parse_arch(args.arch, n_actions) if args.noisy_net_sigma is not None: pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() else: explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) # Use the Nature paper's hyperparameters opt = pfrl.optimizers.RMSpropEpsInsideSqrt( q_func.parameters(), lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2, centered=True, ) # Select a replay buffer to use if args.prioritized: # Anneal beta from beta0 to 1 throughout training betasteps = args.steps / args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( 10**6, alpha=0.6, beta0=0.4, betasteps=betasteps, num_steps=args.num_step_return, ) else: rbuf = replay_buffers.ReplayBuffer(10**6, args.num_step_return) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 Agent = parse_agent(args.agent) agent = Agent( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, clip_delta=args.clip_delta, update_interval=args.update_interval, batch_accumulator="sum", phi=phi, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, checkpoint_freq=args.checkpoint_frequency, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=False, eval_env=eval_env, )
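# "clip_delta" in the DQN-family agents above selects the Huber loss on the
# TD error instead of a plain squared loss, which bounds the gradient of
# large errors. A generic sketch of that loss (assumed formulation, not
# PFRL's internal code):
import torch

def huber_loss_sketch(td_error, delta=1.0):
    abs_err = td_error.abs()
    quadratic = torch.clamp(abs_err, max=delta)   # quadratic region
    linear = abs_err - quadratic                  # linear region beyond delta
    return 0.5 * quadratic ** 2 + delta * linear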
def main(): import logging logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--env", type=str, default="Pendulum-v0") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--final-exploration-steps", type=int, default=10**4) parser.add_argument("--start-epsilon", type=float, default=1.0) parser.add_argument("--end-epsilon", type=float, default=0.1) parser.add_argument("--noisy-net-sigma", type=float, default=None) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--steps", type=int, default=10**5) parser.add_argument("--prioritized-replay", action="store_true") parser.add_argument("--replay-start-size", type=int, default=1000) parser.add_argument("--target-update-interval", type=int, default=10**2) parser.add_argument("--target-update-method", type=str, default="hard") parser.add_argument("--soft-update-tau", type=float, default=1e-2) parser.add_argument("--update-interval", type=int, default=1) parser.add_argument("--eval-n-runs", type=int, default=100) parser.add_argument("--eval-interval", type=int, default=10**4) parser.add_argument("--n-hidden-channels", type=int, default=100) parser.add_argument("--n-hidden-layers", type=int, default=2) parser.add_argument("--gamma", type=float, default=0.99) parser.add_argument("--minibatch-size", type=int, default=None) parser.add_argument("--render-train", action="store_true") parser.add_argument("--render-eval", action="store_true") parser.add_argument("--monitor", action="store_true") parser.add_argument("--reward-scale-factor", type=float, default=1e-3) parser.add_argument( "--actor-learner", action="store_true", help="Enable asynchronous sampling with asynchronous actor(s)", ) # NOQA parser.add_argument( "--num-envs", type=int, default=1, help=("The number of environments for sampling (only effective with" " --actor-learner enabled)"), ) # NOQA args = parser.parse_args() # Set a random seed used in PFRL utils.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) def make_env(idx=0, test=False): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed utils.set_random_seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if isinstance(env.action_space, spaces.Box): utils.env_modifiers.make_action_filtered(env, clip_action_filter) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor) if (args.render_eval and test) or (args.render_train and not test): env = pfrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.max_episode_steps obs_space = env.observation_space obs_size = obs_space.low.size action_space = env.action_space if isinstance(action_space, spaces.Box): action_size = action_space.low.size # Use NAF to apply DQN to continuous action spaces q_func = q_functions.FCQuadraticStateQFunction( obs_size, action_size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, action_space=action_space, ) # Use the Ornstein-Uhlenbeck process for exploration ou_sigma = (action_space.high - action_space.low) * 0.2 explorer = explorers.AdditiveOU(sigma=ou_sigma) else: n_actions = action_space.n q_func = q_functions.FCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, ) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample, ) if args.noisy_net_sigma is not None: pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() opt = optim.Adam(q_func.parameters()) rbuf_capacity = 5 * 10**5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) // args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = replay_buffers.ReplayBuffer(rbuf_capacity) agent = DQN( q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) if args.load: agent.load(args.load) eval_env = make_env(test=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) elif not args.actor_learner: print( "WARNING: Since https://github.com/pfnet/pfrl/pull/112 we have started" " setting `eval_during_episode=True` in this script, which affects the" " timings of evaluation phases.") experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, 
eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, train_max_episode_len=timestep_limit, eval_during_episode=True, ) else: # using impala mode when given num of envs # When we use multiple envs, it is critical to ensure each env # can occupy a CPU core to get the best performance. # Therefore, we need to prevent potential CPU over-provision caused by # multi-threading in Openmp and Numpy. # Disable the multi-threading on Openmp and Numpy. os.environ["OMP_NUM_THREADS"] = "1" # NOQA ( make_actor, learner, poller, exception_event, ) = agent.setup_actor_learner_training(args.num_envs) poller.start() learner.start() experiments.train_agent_async( processes=args.num_envs, make_agent=make_actor, make_env=make_env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, stop_event=learner.stop_event, exception_event=exception_event, ) poller.stop() learner.stop() poller.join() learner.join()
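# The AdditiveOU explorer above perturbs the greedy action with temporally
# correlated Ornstein-Uhlenbeck noise. A minimal sketch of a single OU update
# step (generic formulation; PFRL's default theta/dt may differ):
import numpy as np

def ou_noise_step(x, mu=0.0, theta=0.15, sigma=0.2, dt=1.0, rng=np.random):
    drift = theta * (mu - x) * dt                      # pull back toward mu
    diffusion = sigma * np.sqrt(dt) * rng.standard_normal(np.shape(x))
    return x + drift + diffusion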
def main(): import logging torch.cuda.empty_cache() parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0, help='GPU to use, set to -1 if no GPU') parser.add_argument('--env', type=str, default='LidarBat-v0', help='Bat simulation env') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='data/ppo', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument( "--log-interval", type=int, default=1000, help= "Interval in timesteps between outputting log messages during training", ) parser.add_argument("--num-envs", type=int, default=1, help="Number of envs run in parallel.") parser.add_argument("--batch-size", type=int, default=64, help="Minibatch size") parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in PFRL utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        # TODO: unlike other examples, there is no `if not test:` branch
        # (e.g. for reward scaling) here.
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx, env in enumerate(range(args.num_envs))
        ])

    # Only for getting timesteps, and obs-action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    assert isinstance(action_space, gym.spaces.Box)

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = pfrl.nn.EmpiricalNormalization(obs_space.low.size,
                                                    clip_threshold=5)

    # Policy network; the hidden-layer sizes are magic numbers that should be
    # reconsidered for this task.
    obs_size = obs_space.low.size
    action_size = action_space.low.size
    policy = torch.nn.Sequential(
        nn.Linear(obs_size, 64),
        nn.Tanh(),
        nn.Linear(64, 64),
        nn.Tanh(),
        nn.Linear(64, action_size),
        pfrl.policies.GaussianHeadWithStateIndependentCovariance(
            action_size=action_size,
            var_type="diagonal",
            var_func=lambda x: torch.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )

    vf = torch.nn.Sequential(
        nn.Linear(obs_size, 64),
        nn.Tanh(),
        nn.Linear(64, 64),
        nn.Tanh(),
        nn.Linear(64, 1),
    )

    # While the original paper initialized weights by normal distribution,
    # we use orthogonal initialization as the latest openai/baselines does.
def ortho_init(layer, gain): nn.init.orthogonal_(layer.weight, gain=gain) nn.init.zeros_(layer.bias) ortho_init(policy[0], gain=1) ortho_init(policy[2], gain=1) ortho_init(policy[4], gain=1e-2) ortho_init(vf[0], gain=1) ortho_init(vf[2], gain=1) ortho_init(vf[4], gain=1) # Combine a policy and a value function into a single model model = pfrl.nn.Branched(policy, vf) opt = torch.optim.Adam(model.parameters(), lr=args.lr, eps=1e-5) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batch_size, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=True, gamma=0.995, lambd=0.97, ) if args.load or args.load_pretrained: if args.load_pretrained: raise Exception("Pretrained models are currently unsupported.") # either load or load_pretrained must be false assert not args.load or not args.load_pretrained if args.load: agent.load(args.load) else: agent.load( utils.download_model("PPO", args.env, model_type="final")[0]) if args.demo: env = make_batch_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, max_episode_len=timestep_limit, save_best_so_far_agent=False, )
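# The GaussianHeadWithStateIndependentCovariance used above keeps a learnable
# state-independent log-std and maps it to a variance via exp(2 * log_std)
# (the var_func shown in the policy definition). A minimal sketch of the
# resulting distribution with plain torch.distributions (not PFRL's head):
import torch
from torch import distributions

def diagonal_gaussian_sketch(mean, log_std):
    std = torch.exp(log_std)                      # var = exp(2 * log_std)
    base = distributions.Normal(loc=mean, scale=std)
    return distributions.Independent(base, 1)     # joint over action dims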
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument( "--env", type=str, default="Hopper-v2", help="OpenAI Gym MuJoCo env to perform algorithm on.", ) parser.add_argument("--num-envs", type=int, default=1, help="Number of envs run in parallel.") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--load", type=str, default="", help="Directory to load agent from.") parser.add_argument( "--steps", type=int, default=10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=10, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=5000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=10000, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument("--batch-size", type=int, default=256, help="Minibatch size") parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.") parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.") parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor.") parser.add_argument( "--log-interval", type=int, default=1000, help= "Interval in timesteps between outputting log messages during training", ) parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.") parser.add_argument( "--policy-output-scale", type=float, default=1.0, help="Weight initialization scale of policy output.", ) parser.add_argument( "--optimizer", type=str, default="AdaBelief", ) args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        # Normalize action space to [-1, 1]^n
        env = pfrl.wrappers.NormalizeActionSpace(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx, env in enumerate(range(args.num_envs))
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    def make_optimizer(parameters):
        if args.optimizer == "OfficialAdaBelief":
            import adabelief_pytorch
            optim_class = adabelief_pytorch.AdaBelief
            optim = optim_class(parameters, betas=(0.9, 0.999), eps=1e-12)
        else:
            optim_class = getattr(
                torch_optimizer,
                args.optimizer,
                getattr(torch.optim, args.optimizer, None),
            )
            # Check the optimizer class exists before instantiating it
            assert optim_class is not None
            optim = optim_class(parameters)
        print(str(optim_class), "with default hyperparameters")
        return optim

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight,
                                  gain=args.policy_output_scale)
    policy_optimizer = make_optimizer(policy.parameters())

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = make_optimizer(q_func.parameters())
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(10**6)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=3e-4,
    )

    if len(args.load) > 0 or args.load_pretrained:
        if args.load_pretrained:
            raise Exception("Pretrained models are currently unsupported.")
        # Either load or load_pretrained must be false
        assert not len(args.load) > 0 or not args.load_pretrained
        if len(args.load) > 0:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model("SAC",
                                     args.env,
                                     model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=timestep_limit,
        )
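# The SAC agent above tunes its temperature automatically toward an entropy
# target of -action_size. A sketch of the standard temperature objective from
# the SAC paper (an assumed formulation, not PFRL's exact implementation):
import torch

def temperature_loss_sketch(log_alpha, log_prob, target_entropy):
    # Gradient raises alpha when the policy's entropy falls below the target.
    return -(log_alpha * (log_prob + target_entropy).detach()).mean()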
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument( "--demo", action="store_true", default=False, help="Evaluate the agent without training.", ) parser.add_argument( "--load", type=str, default=None, help="Load a saved agent from a given directory.", ) parser.add_argument( "--final-exploration-steps", type=int, default=5 * 10**5, help="Timesteps after which we stop annealing exploration rate", ) parser.add_argument( "--final-epsilon", type=float, default=0.2, help="Final value of epsilon during training.", ) parser.add_argument( "--steps", type=int, default=2 * 10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--replay-start-size", type=int, default=5 * 10**4, help="Minimum replay buffer size before performing gradient updates.", ) parser.add_argument( "--target-update-interval", type=int, default=1 * 10**4, help="Frequency (in timesteps) at which the target network is updated.", ) parser.add_argument( "--eval-interval", type=int, default=10**5, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( "--update-interval", type=int, default=1, help="Frequency (in timesteps) of network updates.", ) parser.add_argument( "--eval-n-runs", type=int, default=100, help="Number of episodes used for evaluation.", ) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate") parser.add_argument("--num-envs", type=int, default=1, help="Number of envs run in parallel.") parser.add_argument("--batch-size", type=int, default=32, help="Batch size used for training.") parser.add_argument( "--record", action="store_true", default=False, help= "Record videos of evaluation envs. --render should also be specified.", ) parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor.") args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_panda_env(idx, test): from pybullet_robot_envs.envs.panda_envs.panda_push_gym_goal_env import ( pandaPushGymGoalEnv) # NOQA # use different seeds for train vs test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed utils.set_random_seed(env_seed) env = pandaPushGymGoalEnv( renders=args.render and (args.demo or not test)) env.seed(int(env_seed)) if test and args.record: assert args.render, "To use --record, --render needs be specified." 
video_dir = os.path.join(args.outdir, "video_{}".format(idx)) os.mkdir(video_dir) env = RecordMovie(env, video_dir) return env def make_batch_panda_env(test): return pfrl.envs.MultiprocessVectorEnv([ functools.partial(make_panda_env, idx, test) for idx in range(args.num_envs) ]) # eval_env = make_batch_panda_env(test=True) eval_env = make_panda_env(0, test=True) env_state_dim = eval_env.observation_space.spaces['observation'].shape[0] env_action_dim = eval_env.action_space.shape[0] env_subgoal_dim = 5 subgoal_space = gym.spaces.Box(-1, 1, (env_subgoal_dim, )) env_goal_dim = eval_env.observation_space['desired_goal'].shape[0] gpu = 0 if torch.cuda.is_available() else None agent = HIROAgent(state_dim=env_state_dim, action_dim=env_action_dim, goal_dim=env_goal_dim, subgoal_dim=env_subgoal_dim, scale_low=1, start_training_steps=100, model_save_freq=10, model_path='model', buffer_size=200000, batch_size=100, buffer_freq=10, train_freq=10, reward_scaling=0.1, policy_freq_high=2, policy_freq_low=2, gpu=gpu) if args.load: # load weights from agent if arg supplied agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: # train the hierarchical agent. experiments.train_hrl_agent(agent=agent, env=make_panda_env(0, test=False), subgoal=subgoal_space, steps=args.steps, outdir=args.outdir)
def main():
    args = parse_rl_args()
    logging.basicConfig(level=args.log_level)

    if args.env not in FETCH_ENVS:
        raise Exception(f"Invalid environment, please select from {FETCH_ENVS}")

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        # fetch env is unique - it's conditioned on a goal
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        # make_env takes no per-process index here, so every env in the batch
        # is built the same way.
        return pfrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_env, test) for idx in range(args.num_envs)])

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space_dict = env.observation_space
    action_space = env.action_space
    print("Observation space dictionary:", obs_space_dict)
    print("Action space:", action_space)

    # Size of the subgoal is a hyperparameter
    env_subgoal_dim = 3
    # TODO - change the limits, they are completely wrong
    limits = np.array([0.2, 0.2, 0.2])
    subgoal_space = gym.spaces.Box(low=limits * -1, high=limits)
    setattr(env, "subgoal_space", subgoal_space)

    env_state_dim = obs_space_dict.spaces['observation'].low.size
    env_goal_dim = obs_space_dict.spaces['desired_goal'].low.size
    env_action_dim = action_space.low.size

    scale_low = action_space.high * np.ones(env_action_dim)
    scale_high = subgoal_space.high * np.ones(env_subgoal_dim)

    def low_level_burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    def high_level_burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(subgoal_space.low,
                                 subgoal_space.high).astype(np.float32)

    gpu = 0 if torch.cuda.is_available() else None

    agent = HIROAgent(
        state_dim=env_state_dim,
        action_dim=env_action_dim,
        goal_dim=env_goal_dim,
        subgoal_dim=env_subgoal_dim,
        high_level_burnin_action_func=high_level_burnin_action_func,
        low_level_burnin_action_func=low_level_burnin_action_func,
        scale_low=scale_low,
        scale_high=scale_high,
        buffer_size=200000,
        subgoal_freq=10,
        train_freq=10,
        reward_scaling=0.1,
        goal_threshold=0.1,
        gpu=gpu,
        add_entropy=args.add_entropy)

    if args.load:
        # Load weights from a file if the arg is supplied
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=300,
            video_outdir=args.outdir,
            step_number=-1 if args.record else None  # any non-None value enables recording
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        # Train the hierarchical agent
        train_env = make_env(test=False)
        setattr(train_env, "subgoal_space", subgoal_space)
        experiments.train_hrl_agent_with_evaluation(
            agent=agent,
            env=train_env,
            steps=args.steps,
            outdir=args.outdir,
            eval_n_steps=None,
            eval_interval=5000,
            eval_n_episodes=10,
            use_tensorboard=True,
            train_max_episode_len=timestep_limit,
        )
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--outdir", type=str, default="results") parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--steps", type=int, default=8 * 10**7) parser.add_argument("--update-steps", type=int, default=5) parser.add_argument("--lr", type=float, default=7e-4) parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") parser.add_argument("--rmsprop-epsilon", type=float, default=1e-5) parser.add_argument( "--use-gae", action="store_true", default=False, help="use generalized advantage estimation", ) parser.add_argument("--tau", type=float, default=0.95, help="gae parameter") parser.add_argument("--alpha", type=float, default=0.99, help="RMSprop optimizer alpha") parser.add_argument("--eval-interval", type=int, default=10**6) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default="") parser.add_argument("--max-grad-norm", type=float, default=40, help="value loss coefficient") parser.add_argument( "--gpu", "-g", type=int, default=-1, help="GPU ID (negative value indicates CPU)", ) parser.add_argument("--num-envs", type=int, default=1) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.set_defaults(use_lstm=False) args = parser.parse_args() logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx, env in enumerate(range(args.num_envs))
        ])

    sample_env = make_env(0, test=False)
    obs_channel_size = sample_env.observation_space.low.shape[0]
    n_actions = sample_env.action_space.n

    model = nn.Sequential(
        nn.Conv2d(obs_channel_size, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
        pfrl.nn.Branched(
            nn.Sequential(
                nn.Linear(256, n_actions),
                SoftmaxCategoricalHead(),
            ),
            nn.Linear(256, 1),
        ),
    )

    optimizer = pfrl.optimizers.RMSpropEpsInsideSqrt(
        model.parameters(),
        lr=args.lr,
        eps=args.rmsprop_epsilon,
        alpha=args.alpha,
    )

    def phi(x):
        # Feature extractor: scale uint8 frames to [0, 1] float32
        return np.asarray(x, dtype=np.float32) / 255

    agent = a2c.A2C(
        model,
        optimizer,
        gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
        max_grad_norm=args.max_grad_norm,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
        )
        print("n_runs: {} mean: {} median: {} stdev: {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
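# Where the magic number 2592 in the A2C model above comes from: with the
# standard 84x84 DeepMind-preprocessed frames (an assumption about the
# wrappers used), the two conv layers leave 32 feature maps of size 9x9.
def conv_out_size(size, kernel, stride):
    return (size - kernel) // stride + 1

_h = conv_out_size(conv_out_size(84, 8, 4), 4, 2)  # 84 -> 20 -> 9
_flat_features = 32 * _h * _h                      # 32 * 9 * 9 = 2592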
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--load", type=str, default=None) parser.add_argument("--eval-epsilon", type=float, default=0.0) parser.add_argument("--noisy-net-sigma", type=float, default=0.2) parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--replay-start-size", type=int, default=2 * 10**3) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--n-best-episodes", type=int, default=200) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed test_ID = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S") args.outdir = experiments.prepare_output_dir(args, args.outdir, test_ID) print("Output files are saved in {}".format(args.outdir)) env = MapRootEnv() eval_env = MapRootEnv() n_actions = env.action_space.n input_shape = env.input_shape n_atoms = 51 v_max = 10 v_min = -10 q_func = MyDistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max, input_shape[2]) # Noisy nets pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Use the same hyper parameters as https://arxiv.org/abs/1710.02298 opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4) # Prioritized Replay # Anneal beta from beta0 to 1 throughout training update_interval = 4 betasteps = args.steps / update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( 5 * 10**4, alpha=0.5, beta0=0.4, betasteps=betasteps, num_steps=3, normalize_by_max="batch", ) Agent = agents.CategoricalDoubleDQN agent = Agent(q_func, opt, rbuf, gpu=args.gpu, gamma=0.80, explorer=explorer, minibatch_size=32, replay_start_size=args.replay_start_size, target_update_interval=32000, update_interval=update_interval, batch_accumulator="mean") if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None) print("n_episodes: {} mean: {} median: {} stdev {}".format( eval_stats["episodes"], eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=True, eval_env=eval_env, logger=TBLogger(args.outdir)) # dir_of_best_network = os.path.join(args.outdir, "best") # agent.load(dir_of_best_network) # run 200 evaluation episodes, each capped at 30 mins of play stats = experiments.evaluator.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.n_best_episodes, max_episode_len=args.max_frames / 4, logger=None) with open(os.path.join(args.outdir, "bestscores.json"), "w") as f: json.dump(stats, f) print("The results of the best scoring network:") for stat in stats: print(str(stat) + ":" + str(stats[stat]))
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--num-envs", type=int, default=1, help="Number of envs run in parallel.") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--load", type=str, default="", help="Directory to load agent from.") parser.add_argument( "--steps", type=int, default=10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=10, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=100000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=2500, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--update-interval", type=int, default=1, help="Interval in timesteps between model updates.", ) parser.add_argument("--batch-size", type=int, default=100, help="Minibatch size") parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.") parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.") parser.add_argument("--monitor", action="store_true", help="Wrap env with Monitor to write videos.") parser.add_argument( "--log-interval", type=int, default=1000, help= "Interval in timesteps between outputting log messages during training", ) parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.") parser.add_argument( "--n-hidden-channels", type=int, default=256, help="Number of hidden channels of NN models.", ) parser.add_argument( "--env", default="AntMaze", help= "Type of Ant Env to use. Options are AntMaze, AntFall, and AntPush.", type=str) parser.add_argument("--discount", type=float, default=0.99, help="Discount factor.") parser.add_argument("--n-step-return", type=int, default=3, help="N-step return.") parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.") parser.add_argument("--adam-eps", type=float, default=1e-1, help="Adam eps.") args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 def make_ant_env(idx, test): # use different seeds for train vs test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed # env_seed = np.random.randint(0, 2**32 - 1) if not test else process_seed print('seed', env_seed) utils.set_random_seed(env_seed) # create the anv environment with goal env = AntEnvWithGoal(create_maze_env(args.env), args.env, env_subgoal_dim=15) env.seed(int(env_seed)) if args.render: env = pfrl.wrappers.GymLikeEnvRender(env, mode='human') return env eval_env = make_ant_env(0, test=True) env_state_dim = eval_env.state_dim env_action_dim = eval_env.action_dim if args.env == 'AntMaze' or args.env == 'AntPush': env_goal_dim = 2 else: env_goal_dim = 3 action_size = env_action_dim action_space = eval_env.action_space scale_low = action_space.high * np.ones(env_action_dim) def squashed_diagonal_gaussian_head(x): assert x.shape[-1] == action_size * 2 mean, log_scale = torch.chunk(x, 2, dim=1) log_scale = torch.clamp(log_scale, -20.0, 2.0) var = torch.exp(log_scale * 2) base_distribution = distributions.Independent( distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1) # cache_size=1 is required for numerical stability return distributions.transformed_distribution.TransformedDistribution( base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]) policy = nn.Sequential( nn.Linear(env_state_dim + env_goal_dim, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, action_size * 2), Lambda(squashed_diagonal_gaussian_head), ) torch.nn.init.xavier_uniform_(policy[0].weight) torch.nn.init.xavier_uniform_(policy[2].weight) torch.nn.init.xavier_uniform_(policy[4].weight) policy_optimizer = torch.optim.Adam(policy.parameters(), lr=0.0001) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(env_state_dim + env_goal_dim + env_action_dim, 300), nn.ReLU(), nn.Linear(300, 300), nn.ReLU(), nn.Linear(300, 1), ) torch.nn.init.xavier_uniform_(q_func[1].weight) torch.nn.init.xavier_uniform_(q_func[3].weight) torch.nn.init.xavier_uniform_(q_func[5].weight) q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=0.001) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() rbuf = replay_buffers.ReplayBuffer(200000) def burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) if args.gpu is not None and args.gpu >= 0: assert torch.cuda.is_available() device = torch.device("cuda:{}".format(args.gpu)) else: device = torch.device("cpu") # Hyperparameters in http://arxiv.org/abs/1802.09477 scale_tensor = torch.tensor(scale_low).float().to(device) agent = pfrl.agents.SoftActorCritic( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=args.discount, update_interval=args.update_interval, replay_start_size=args.replay_start_size, gpu=args.gpu, minibatch_size=args.batch_size, burnin_action_func=burnin_action_func, entropy_target=-action_size, temperature_optimizer_lr=args.lr, scale=scale_tensor) if len(args.load) > 0: agent.load(args.load) if args.demo: eval_env = make_env(args, seed=0, test=True) eval_stats = experiments.eval_performance( 
env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_goal_conditioned_agent_with_evaluation( agent=agent, env=make_ant_env(0, test=False), steps=args.steps, eval_n_steps=None, outdir=args.outdir, eval_n_episodes=args.eval_n_runs, eval_interval=5000, use_tensorboard=True, )
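# Illustrative sketch of the squashed diagonal Gaussian used as the SAC policy
# head above: a diagonal Normal whose samples are pushed through tanh so
# actions stay in (-1, 1). Stand-alone, with a made-up batch; it mirrors the
# head's math but is not the training code itself.
import torch
from torch import distributions

batch, action_size = 2, 8
mean = torch.zeros(batch, action_size)
log_scale = torch.full((batch, action_size), -1.0).clamp(-20.0, 2.0)
base = distributions.Independent(
    distributions.Normal(loc=mean, scale=torch.exp(log_scale)), 1)
squashed = distributions.transformed_distribution.TransformedDistribution(
    base, [distributions.transforms.TanhTransform(cache_size=1)])

action = squashed.rsample()            # differentiable sample in (-1, 1)
log_prob = squashed.log_prob(action)   # includes the tanh Jacobian correction
print(action.shape, log_prob.shape)    # torch.Size([2, 8]), torch.Size([2])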
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument( "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU." ) parser.add_argument( "--env", type=str, default="reach_target-ee-vision-v0", help="OpenAI Gym MuJoCo env to perform algorithm on.", ) parser.add_argument( "--num-envs", type=int, default=1, help="Number of envs run in parallel." ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument( "--outdir", type=str, default="results", help=( "Directory path to save output files." " If it does not exist, it will be created." ), ) parser.add_argument( "--steps", type=int, default=2 * 10 ** 6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-interval", type=int, default=100000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--eval-n-runs", type=int, default=100, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--render", action="store_true", help="Render env states in a GUI window." ) parser.add_argument( "--demo", action="store_true", help="Just run evaluation, not training." ) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument( "--load", type=str, default="", help="Directory to load agent from." ) parser.add_argument( "--log-level", type=int, default=logging.INFO, help="Level of the root logger." ) parser.add_argument( "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor." ) parser.add_argument( "--log-interval", type=int, default=1000, help="Interval in timesteps between outputting log messages during training", ) parser.add_argument( "--update-interval", type=int, default=2048, help="Interval in timesteps between model updates.", ) parser.add_argument( "--epochs", type=int, default=10, help="Number of epochs to update model for per PPO iteration.", ) parser.add_argument( "--action-size", type=int, default=3, help="Action size (needs to match env.action_space)", ) parser.add_argument("--batch-size", type=int, default=64, help="Minibatch size") args = parser.parse_args() logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2 ** 32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): render_mode = 'human' if args.render else None env = NormalizeAction(GraspActionWrapper(FlattenObservation(ResizeObservation(WristObsWrapper(gym.make(args.env, render_mode=render_mode)), (64, 64))), args.action_size)) # env = GraspActionWrapper(RescaleAction(FlattenObservation(ResizeObservation(WristObsWrapper(gym.make(args.env)), (64, 64))), -0.5, 0.5)) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2 ** 32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if args.render: env = pfrl.wrappers.Render(env) return env def make_batch_env(test): return MultiprocessVectorEnv( [ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ] ) # Only for getting timesteps, and obs-action spaces # sample_env = RescaleAction(GraspActionWrapper(FlattenObservation(ResizeObservation(WristObsWrapper(gym.make(args.env)), (64, 64))), args.action_size), -0.5, 0.5) # timestep_limit = sample_env.spec.max_episode_steps timestep_limit = 200 # obs_space = sample_env.observation_space obs_space = spaces.Box(low=0, high=1, shape=(64 * 64 * 3,)) # action_space = sample_env.action_space action_space = spaces.Box(low=-1.0, high=1.0, shape=(args.action_size,)) print("Observation space:", obs_space) print("Action space:", action_space) # assert obs_space == spaces.Box(low=0, high=1, shape=(64 * 64 * 3,)) # assert action_space == spaces.Box(low=-1.0, high=1.0, shape=(args.action_size,)) # sample_env.close() assert isinstance(action_space, gym.spaces.Box) # Normalize observations based on their empirical mean and variance obs_normalizer = pfrl.nn.EmpiricalNormalization( obs_space.low.size, clip_threshold=5 ) obs_size = obs_space.low.size action_size = action_space.low.size policy = torch.nn.Sequential( nn.Linear(obs_size, 64), nn.Tanh(), nn.Linear(64, 64), nn.Tanh(), nn.Linear(64, action_size), pfrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type="diagonal", var_func=lambda x: torch.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 ), ) vf = torch.nn.Sequential( nn.Linear(obs_size, 64), nn.Tanh(), nn.Linear(64, 64), nn.Tanh(), nn.Linear(64, 1), ) # While the original paper initialized weights by normal distribution, # we use orthogonal initialization as the latest openai/baselines does. 
def ortho_init(layer, gain): nn.init.orthogonal_(layer.weight, gain=gain) nn.init.zeros_(layer.bias) ortho_init(policy[0], gain=1) ortho_init(policy[2], gain=1) ortho_init(policy[4], gain=1e-2) ortho_init(vf[0], gain=1) ortho_init(vf[2], gain=1) ortho_init(vf[4], gain=1) # Combine a policy and a value function into a single model model = pfrl.nn.Branched(policy, vf) opt = torch.optim.Adam(model.parameters(), lr=3e-4, eps=1e-5) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batch_size, epochs=args.epochs, clip_eps_vf=None, entropy_coef=0, standardize_advantages=True, gamma=0.995, lambd=0.97, ) if args.load or args.load_pretrained: if args.load_pretrained: raise Exception("Pretrained models are currently unsupported.") # either load or load_pretrained must be false assert not args.load or not args.load_pretrained if args.load: agent.load(args.load) else: agent.load(utils.download_model("PPO", args.env, model_type="final")[0]) if args.demo: env = make_batch_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print( "n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], ) ) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, max_episode_len=timestep_limit, save_best_so_far_agent=True, )
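# Illustrative sketch of the orthogonal initialization applied to the PPO
# policy and value networks above: rows of the weight matrix are orthonormal
# (up to the gain), which keeps activations well-scaled at the start of
# training. Stand-alone example with a hypothetical square 64x64 layer.
import torch
from torch import nn

layer = nn.Linear(64, 64)
nn.init.orthogonal_(layer.weight, gain=1)
nn.init.zeros_(layer.bias)

with torch.no_grad():
    w = layer.weight
    # For gain=1 and a square layer, W @ W.T is (approximately) the identity.
    print(torch.allclose(w @ w.t(), torch.eye(64), atol=1e-5))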
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.01) parser.add_argument("--eval-epsilon", type=float, default=0.001) parser.add_argument("--noisy-net-sigma", type=float, default=None) parser.add_argument( "--arch", type=str, default="doubledqn", choices=["nature", "nips", "dueling", "doubledqn"], ) parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) parser.add_argument("--target-update-interval", type=int, default=3 * 10**4) parser.add_argument("--eval-interval", type=int, default=10**5) parser.add_argument("--update-interval", type=int, default=4) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--no-clip-delta", dest="clip_delta", action="store_false") parser.set_defaults(clip_delta=True) parser.add_argument("--agent", type=str, default="DoubleDQN", choices=["DQN", "DoubleDQN", "PAL"]) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate") parser.add_argument( "--prioritized", action="store_true", default=False, help="Use prioritized experience replay.", ) parser.add_argument("--num-envs", type=int, default=1) parser.add_argument("--n-step-return", type=int, default=1) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, frame_stack=False, ) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) env.seed(env_seed) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env def make_batch_env(test): vec_env = pfrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) vec_env = pfrl.wrappers.VectorFrameStack(vec_env, 4) return vec_env sample_env = make_env(0, test=False) n_actions = sample_env.action_space.n q_func = parse_arch(args.arch, n_actions) if args.noisy_net_sigma is not None: pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Use the same hyper parameters as the Nature paper's opt = optim.RMSprop( q_func.parameters(), lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2, centered=True, ) # Select a replay buffer to use if args.prioritized: # Anneal beta from beta0 to 1 throughout training betasteps = args.steps / args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( 10**6, alpha=0.6, beta0=0.4, betasteps=betasteps, num_steps=args.n_step_return, ) else: rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 Agent = parse_agent(args.agent) agent = Agent( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, clip_delta=args.clip_delta, update_interval=args.update_interval, batch_accumulator="sum", phi=phi, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=make_batch_env(test=True), agent=agent, n_steps=None, n_episodes=args.eval_n_runs, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(test=False), eval_env=make_batch_env(test=True), steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=False, log_interval=1000, )
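# Illustrative sketch of the seeding scheme used above (and in the other
# multi-env scripts): each subprocess gets its own seed, and test envs use the
# "mirrored" seed 2**32 - 1 - process_seed so the train and test random streams
# never collide. Pure numpy, with example values standing in for args.seed and
# args.num_envs.
import numpy as np

seed, num_envs = 1, 4
process_seeds = np.arange(num_envs) + seed * num_envs       # [4, 5, 6, 7]
train_seeds = [int(s) for s in process_seeds]
test_seeds = [2**32 - 1 - int(s) for s in process_seeds]
print(train_seeds)  # [4, 5, 6, 7]
print(test_seeds)   # [4294967291, 4294967290, 4294967289, 4294967288]
assert not set(train_seeds) & set(test_seeds)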
def main(): parser = argparse.ArgumentParser() parser.add_argument("processes", type=int) parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--t-max", type=int, default=5) parser.add_argument("--replay-start-size", type=int, default=10000) parser.add_argument("--n-times-replay", type=int, default=4) parser.add_argument("--beta", type=float, default=1e-2) parser.add_argument("--profile", action="store_true") parser.add_argument("--steps", type=int, default=10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--lr", type=float, default=7e-4) parser.add_argument("--eval-interval", type=int, default=10**5) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--use-lstm", action="store_true") parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default="") parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.set_defaults(use_lstm=False) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) n_actions = gym.make(args.env).action_space.n input_to_hidden = nn.Sequential( nn.Conv2d(4, 16, 8, stride=4), nn.ReLU(), nn.Conv2d(16, 32, 4, stride=2), nn.ReLU(), nn.Flatten(), nn.Linear(2592, 256), nn.ReLU(), ) head = acer.ACERDiscreteActionHead( pi=nn.Sequential( nn.Linear(256, n_actions), SoftmaxCategoricalHead(), ), q=nn.Sequential( nn.Linear(256, n_actions), DiscreteActionValueHead(), ), ) if args.use_lstm: model = pfrl.nn.RecurrentSequential( input_to_hidden, nn.LSTM(num_layers=1, input_size=256, hidden_size=256), head, ) else: model = nn.Sequential(input_to_hidden, head) model.apply(pfrl.initializers.init_chainer_default) opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(model.parameters(), lr=args.lr, eps=4e-3, alpha=0.99) replay_buffer = EpisodicReplayBuffer(10**6 // args.processes) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = acer.ACER( model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, beta=args.beta, phi=phi, max_grad_norm=40, recurrent=args.use_lstm, ) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): for pg in agent.optimizer.param_groups: assert "lr" in pg pg["lr"] = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, )
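# Illustrative sketch of the linear learning-rate decay that the lr_setter hook
# above implements: at global step t the learning rate is interpolated from the
# initial value down to 0 at the final step. This is a hand-rolled stand-in,
# not PFRL's LinearInterpolationHook itself.
import torch

def linear_lr(step, total_steps, lr_start=7e-4, lr_end=0.0):
    fraction = min(step / total_steps, 1.0)
    return lr_start + (lr_end - lr_start) * fraction

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.RMSprop(params, lr=7e-4)
for step in (0, 5 * 10**6, 10**7):
    new_lr = linear_lr(step, total_steps=10**7)
    for pg in opt.param_groups:
        pg["lr"] = new_lr
    print(step, new_lr)  # 7e-4 at step 0, 3.5e-4 halfway, 0.0 at the end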
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--load", type=str, default=None) parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.01) parser.add_argument("--eval-epsilon", type=float, default=0.001) parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) parser.add_argument("--target-update-interval", type=int, default=10**4) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--update-interval", type=int, default=4) parser.add_argument("--batch-size", type=int, default=32) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--batch-accumulator", type=str, default="mean", choices=["mean", "sum"]) parser.add_argument("--quantile-thresholds-N", type=int, default=64) parser.add_argument("--quantile-thresholds-N-prime", type=int, default=64) parser.add_argument("--quantile-thresholds-K", type=int, default=32) parser.add_argument("--n-best-episodes", type=int, default=200) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n q_func = pfrl.agents.iqn.ImplicitQuantileQFunction( psi=nn.Sequential( nn.Conv2d(4, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(), nn.Conv2d(64, 64, 3, stride=1), nn.ReLU(), nn.Flatten(), ), phi=nn.Sequential( pfrl.agents.iqn.CosineBasisLinear(64, 3136), nn.ReLU(), ), f=nn.Sequential( nn.Linear(3136, 512), nn.ReLU(), nn.Linear(512, n_actions), ), ) # Use the same hyper parameters as https://arxiv.org/abs/1710.10044 opt = torch.optim.Adam(q_func.parameters(), lr=5e-5, eps=1e-2 / args.batch_size) rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = pfrl.agents.IQN( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, batch_accumulator=args.batch_accumulator, phi=phi, quantile_thresholds_N=args.quantile_thresholds_N, quantile_thresholds_N_prime=args.quantile_thresholds_N_prime, quantile_thresholds_K=args.quantile_thresholds_K, ) if args.load or args.load_pretrained: # either load or load_pretrained must be false assert not args.load or not args.load_pretrained if args.load: agent.load(args.load) else: agent.load( utils.download_model("IQN", args.env, model_type=args.pretrained_type)[0]) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None, ) print("n_steps: {} mean: {} median: {} stdev {}".format( args.eval_n_steps, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=True, eval_env=eval_env, ) dir_of_best_network = os.path.join(args.outdir, "best") agent.load(dir_of_best_network) # run 200 evaluation episodes, each capped at 30 mins of play stats = experiments.evaluator.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.n_best_episodes, max_episode_len=args.max_frames / 4, logger=None, ) with open(os.path.join(args.outdir, "bestscores.json"), "w") as f: json.dump(stats, f) print("The results of the best scoring network:") for stat in stats: print(str(stat) + ":" + str(stats[stat]))
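# Illustrative sketch of the LinearDecayEpsilonGreedy explorer used above:
# epsilon falls linearly from 1.0 to final_epsilon over
# final_exploration_frames steps and then stays there. Stand-alone helper with
# the script's default values plugged in; not PFRL's explorer class.
import numpy as np

def epsilon_at(step, start=1.0, end=0.01, decay_steps=10**6):
    fraction = min(step / decay_steps, 1.0)
    return start + (end - start) * fraction

def act(step, greedy_action, n_actions, rng=np.random):
    """Pick the greedy action, or a uniformly random one with prob. epsilon."""
    if rng.uniform() < epsilon_at(step):
        return rng.randint(n_actions)
    return greedy_action

print(epsilon_at(0), epsilon_at(5 * 10**5), epsilon_at(10**6))  # 1.0 0.505 0.01
print(act(step=0, greedy_action=3, n_actions=6))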
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument( "--env", type=str, default="Hopper-v2", help="OpenAI Gym MuJoCo env to perform algorithm on.", ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--load", type=str, default="", help="Directory to load agent from.") parser.add_argument( "--steps", type=int, default=10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=10, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=5000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=10000, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument("--batch-size", type=int, default=100, help="Minibatch size") parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.") parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.") parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor.") parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.") args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) def make_env(test): env = gym.make(args.env) # Unwrap TimeLimit wrapper assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if args.render and not test: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.max_episode_steps obs_space = env.observation_space action_space = env.action_space print("Observation space:", obs_space) print("Action space:", action_space) obs_size = obs_space.low.size action_size = action_space.low.size policy = nn.Sequential( nn.Linear(obs_size, 400), nn.ReLU(), nn.Linear(400, 300), nn.ReLU(), nn.Linear(300, action_size), nn.Tanh(), pfrl.policies.DeterministicHead(), ) policy_optimizer = torch.optim.Adam(policy.parameters()) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(obs_size + action_size, 400), nn.ReLU(), nn.Linear(400, 300), nn.ReLU(), nn.Linear(300, 1), ) q_func_optimizer = torch.optim.Adam(q_func.parameters()) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.AdditiveGaussian(scale=0.1, low=action_space.low, high=action_space.high) def 
burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) # Hyperparameters in http://arxiv.org/abs/1802.09477 agent = pfrl.agents.TD3( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=0.99, soft_update_tau=5e-3, explorer=explorer, replay_start_size=args.replay_start_size, gpu=args.gpu, minibatch_size=args.batch_size, burnin_action_func=burnin_action_func, ) if len(args.load) > 0 or args.load_pretrained: # either load or load_pretrained must be false assert not len(args.load) > 0 or not args.load_pretrained if len(args.load) > 0: agent.load(args.load) else: agent.load( utils.download_model("TD3", args.env, model_type=args.pretrained_type)[0]) eval_env = make_env(test=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) import json import os with open(os.path.join(args.outdir, "demo_scores.json"), "w") as f: json.dump(eval_stats, f) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_env=eval_env, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, train_max_episode_len=timestep_limit, )
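# Illustrative sketch of the AdditiveGaussian exploration used by the TD3
# script above: Gaussian noise with scale 0.1 is added to the deterministic
# policy action and the result is clipped back into the action box.
# Stand-alone numpy version with a hypothetical 6-dimensional action space.
import numpy as np

low = -np.ones(6, dtype=np.float32)
high = np.ones(6, dtype=np.float32)

def explore(policy_action, scale=0.1, rng=np.random):
    noise = rng.normal(loc=0.0, scale=scale, size=policy_action.shape)
    return np.clip(policy_action + noise, low, high).astype(np.float32)

print(explore(np.zeros(6, dtype=np.float32)))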
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--env", type=str, default="BreakoutNoFrameskip-v4", help="OpenAI Atari domain to perform algorithm on.", ) parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load-pretrained", action="store_true", default=False) parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"]) parser.add_argument("--load", type=str, default=None) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument( "--steps", type=int, default=5 * 10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--replay-start-size", type=int, default=5 * 10**4, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument("--n-best-episodes", type=int, default=30) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=None), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, 0.05) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n q_func = nn.Sequential( pnn.LargeAtariCNN(), init_chainer_default(nn.Linear(512, n_actions)), DiscreteActionValueHead(), ) # Use the same hyperparameters as the Nature paper opt = pfrl.optimizers.RMSpropEpsInsideSqrt( q_func.parameters(), lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2, centered=True, ) rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( start_epsilon=1.0, end_epsilon=0.1, decay_steps=10**6, random_action_func=lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 Agent = agents.DQN agent = Agent( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=10**4, clip_delta=True, update_interval=4, batch_accumulator="sum", phi=phi, ) if args.load or args.load_pretrained: # either load or load_pretrained must be false assert not args.load or not args.load_pretrained if args.load: agent.load(args.load) else: agent.load( utils.download_model("DQN", args.env, model_type=args.pretrained_type)[0]) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None) print("n_episodes: {} mean: {} median: {} stdev {}".format( eval_stats["episodes"], eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=True, eval_env=eval_env, ) dir_of_best_network = os.path.join(args.outdir, "best") agent.load(dir_of_best_network) # run 30 evaluation episodes, each capped at 5 mins of play stats = experiments.evaluator.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.n_best_episodes, max_episode_len=4500, logger=None, ) with open(os.path.join(args.outdir, "bestscores.json"), "w") as f: json.dump(stats, f) print("The results of the best scoring network:") for stat in stats: print(str(stat) + ":" + str(stats[stat]))
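# Illustrative sketch of the `phi` feature extractor used above: Atari frames
# are kept as uint8 in the replay buffer (saving memory) and only converted to
# float32 in [0, 1] right before they are fed to the network. Stand-alone,
# with a fake stacked observation.
import numpy as np

def phi(x):
    return np.asarray(x, dtype=np.float32) / 255

obs = np.random.randint(0, 256, size=(4, 84, 84), dtype=np.uint8)  # 4 stacked frames
features = phi(obs)
print(obs.dtype, features.dtype, features.min() >= 0.0, features.max() <= 1.0)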
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument( "--env", type=str, default="RoboschoolAtlasForwardWalk-v1", help="OpenAI Gym env to perform algorithm on.", ) parser.add_argument("--num-envs", type=int, default=4, help="Number of envs run in parallel.") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--load", type=str, default="", help="Directory to load agent from.") parser.add_argument( "--steps", type=int, default=10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--eval-n-runs", type=int, default=20, help="Number of episodes run for each evaluation.", ) parser.add_argument( "--eval-interval", type=int, default=100000, help="Interval in timesteps between evaluations.", ) parser.add_argument( "--replay-start-size", type=int, default=10000, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--update-interval", type=int, default=1, help="Interval in timesteps between model updates.", ) parser.add_argument("--batch-size", type=int, default=256, help="Minibatch size") parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.") parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.") parser.add_argument("--monitor", action="store_true", help="Wrap env with Monitor to write videos.") parser.add_argument( "--log-interval", type=int, default=1000, help= "Interval in timesteps between outputting log messages during training", ) parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.") parser.add_argument( "--n-hidden-channels", type=int, default=1024, help="Number of hidden channels of NN models.", ) parser.add_argument("--discount", type=float, default=0.98, help="Discount factor.") parser.add_argument("--n-step-return", type=int, default=3, help="N-step return.") parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.") parser.add_argument("--adam-eps", type=float, default=1e-1, help="Adam eps.") args = parser.parse_args() logging.basicConfig(level=args.log_level) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) # Set a random seed used in PFRL utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 def make_batch_env(test): return pfrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, args, process_seeds[idx], test) for idx, env in enumerate(range(args.num_envs)) ]) sample_env = make_env(args, process_seeds[0], test=False) timestep_limit = sample_env.spec.max_episode_steps obs_space = sample_env.observation_space action_space = sample_env.action_space print("Observation space:", obs_space) print("Action space:", action_space) del sample_env action_size = action_space.low.size def squashed_diagonal_gaussian_head(x): assert x.shape[-1] == action_size * 2 mean, log_scale = torch.chunk(x, 2, dim=1) log_scale = torch.clamp(log_scale, -20.0, 2.0) var = torch.exp(log_scale * 2) base_distribution = distributions.Independent( distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1) # cache_size=1 is required for numerical stability return distributions.transformed_distribution.TransformedDistribution( base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]) policy = nn.Sequential( nn.Linear(obs_space.low.size, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, action_size * 2), Lambda(squashed_diagonal_gaussian_head), ) torch.nn.init.xavier_uniform_(policy[0].weight) torch.nn.init.xavier_uniform_(policy[2].weight) torch.nn.init.xavier_uniform_(policy[4].weight) policy_optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr, eps=args.adam_eps) def make_q_func_with_optimizer(): q_func = nn.Sequential( pfrl.nn.ConcatObsAndAction(), nn.Linear(obs_space.low.size + action_size, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, args.n_hidden_channels), nn.ReLU(), nn.Linear(args.n_hidden_channels, 1), ) torch.nn.init.xavier_uniform_(q_func[1].weight) torch.nn.init.xavier_uniform_(q_func[3].weight) torch.nn.init.xavier_uniform_(q_func[5].weight) q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=args.lr, eps=args.adam_eps) return q_func, q_func_optimizer q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return) def burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) # Hyperparameters in http://arxiv.org/abs/1802.09477 agent = pfrl.agents.SoftActorCritic( policy, q_func1, q_func2, policy_optimizer, q_func1_optimizer, q_func2_optimizer, rbuf, gamma=args.discount, update_interval=args.update_interval, replay_start_size=args.replay_start_size, gpu=args.gpu, minibatch_size=args.batch_size, burnin_action_func=burnin_action_func, entropy_target=-action_size, temperature_optimizer_lr=args.lr, ) if len(args.load) > 0: agent.load(args.load) if args.demo: eval_env = make_env(args, seed=0, test=True) eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(test=False), eval_env=make_batch_env(test=True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, 
eval_interval=args.eval_interval, log_interval=args.log_interval, max_episode_len=timestep_limit, )
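# Illustrative sketch of the 3-step return that ReplayBuffer(num_steps=3) above
# trains on: rewards from three consecutive transitions are summed with
# discounting, and the bootstrap value is discounted by gamma**3. Stand-alone
# arithmetic with made-up numbers.
gamma = 0.98
rewards = [1.0, 0.5, -0.2]          # r_t, r_{t+1}, r_{t+2}
bootstrap_value = 2.0               # stand-in for the target value at s_{t+3}

n_step_return = sum(gamma**i * r for i, r in enumerate(rewards))
target = n_step_return + gamma**len(rewards) * bootstrap_value
print(n_step_return, target)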
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="CartPole-v0") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--beta", type=float, default=1e-4) parser.add_argument("--batchsize", type=int, default=10) parser.add_argument("--steps", type=int, default=10**5) parser.add_argument("--eval-interval", type=int, default=10**4) parser.add_argument("--eval-n-runs", type=int, default=100) parser.add_argument("--reward-scale-factor", type=float, default=1e-2) parser.add_argument("--render", action="store_true", default=False) parser.add_argument("--lr", type=float, default=1e-3) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default="") parser.add_argument("--log-level", type=int, default=logging.INFO) parser.add_argument("--monitor", action="store_true") args = parser.parse_args() logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render and not test: env = pfrl.wrappers.Render(env) return env train_env = make_env(test=False) timestep_limit = train_env.spec.max_episode_steps obs_space = train_env.observation_space action_space = train_env.action_space obs_size = obs_space.low.size hidden_size = 200 # Switch policy types accordingly to action space types if isinstance(action_space, gym.spaces.Box): model = nn.Sequential( nn.Linear(obs_size, hidden_size), nn.LeakyReLU(0.2), nn.Linear(hidden_size, hidden_size), nn.LeakyReLU(0.2), nn.Linear(hidden_size, action_space.low.size), GaussianHeadWithFixedCovariance(0.3), ) else: model = nn.Sequential( nn.Linear(obs_size, hidden_size), nn.LeakyReLU(0.2), nn.Linear(hidden_size, hidden_size), nn.LeakyReLU(0.2), nn.Linear(hidden_size, action_space.n), SoftmaxCategoricalHead(), ) opt = torch.optim.Adam(model.parameters(), lr=args.lr) agent = pfrl.agents.REINFORCE( model, opt, gpu=args.gpu, beta=args.beta, batchsize=args.batchsize, max_grad_norm=1.0, ) if args.load: agent.load(args.load) eval_env = make_env(test=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=train_env, eval_env=eval_env, outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=timestep_limit, )
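# Illustrative sketch of the REINFORCE update performed by the agent above:
# the loss is the negative log-likelihood of the taken actions weighted by the
# episode return, so gradient ascent raises the probability of actions taken in
# high-return episodes. Minimal stand-alone version with a categorical policy
# and made-up data; PFRL's agent adds batching over episodes, an entropy bonus
# (beta) and gradient clipping on top of this.
import torch
from torch import nn
from torch.distributions import Categorical

policy = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2))
opt = torch.optim.Adam(policy.parameters(), lr=1e-3)

obs = torch.randn(10, 4)                 # one fake 10-step episode
actions = torch.randint(0, 2, (10,))
episode_return = torch.tensor(3.0)       # (scaled) sum of rewards

log_probs = Categorical(logits=policy(obs)).log_prob(actions)
loss = -(log_probs * episode_return).sum()
opt.zero_grad()
loss.backward()
opt.step()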
def main(): parser = argparse.ArgumentParser() # training parameters parser.add_argument( "--env", type=str, default="LunarLander-v2", help="OpenAI Gym Environment ID.", ) parser.add_argument( "--outdir", type=str, default="results", help=( "Directory path to save output files." " If it does not exist, it will be created." ), ) parser.add_argument( "--seed", type=int, default=0, help="Random seed for randomizer.", ) parser.add_argument( "--monitor", action="store_true", default=False, help=( "Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument( "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU." ) parser.add_argument( "--steps", type=int, default=4 * 10**5, help="Total number of timesteps to train the agent for each trial", ) parser.add_argument( "--train-max-episode-len", type=int, default=1000, help="Maximum episode length during training.", ) parser.add_argument( "--eval-n-episodes", type=int, default=10, help="Number of episodes at each evaluation phase.", ) parser.add_argument( "--eval-interval", type=int, default=10**4, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( "--batch-size", type=int, default=64, help="Training batch size.", ) # Optuna related args parser.add_argument( "--optuna-study-name", type=str, default="optuna-pfrl-quickstart", help="Name for Optuna Study.", ) parser.add_argument( "--optuna-storage", type=str, default="sqlite:///example.db", help=( "DB URL for Optuna Study. Be sure to create one beforehand: " "optuna create-study --study-name <name> --storage <storage> --direction maximize" # noqa ), ) parser.add_argument( "--optuna-training-steps-budget", type=int, default=4 * 10**7, help=( "Total training steps throughout the optimization. If the pruner works " "well, these limited training steps can be allocated to promising trials " "efficiently, and thus the tuned hyperparameter should get better." ), ) parser.add_argument( "--optuna-pruner", type=str, default="NopPruner", choices=["NopPruner", "ThresholdPruner", "PercentilePruner", "HyperbandPruner"], help=( "Optuna pruner. For more details see: " "https://optuna.readthedocs.io/en/stable/reference/pruners.html" ), ) # add pruner specific arguments... _tmp_args, _unknown = parser.parse_known_args() n_warmup_steps_help_msg = ( "Don't prune for first `n_warmup_steps` steps for each trial (pruning check " "will be invoked every `eval_interval` step). Note that `step` for the pruner " "is the training step, not the number of evaluations so far." ) if _tmp_args.optuna_pruner == "NopPruner": pass elif _tmp_args.optuna_pruner == "ThresholdPruner": parser.add_argument( "--lower", type=float, required=True, help=( "Lower side threshold score for pruning trials. " "Please set the appropriate value for your specified env."
), ) parser.add_argument( "--n-warmup-steps", type=int, default=5 * _tmp_args.eval_interval, help=n_warmup_steps_help_msg, ) elif _tmp_args.optuna_pruner == "PercentilePruner": parser.add_argument( "--percentile", type=float, default=50.0, help="Setting percentile == 50.0 is equivalent to the MedianPruner.", ) parser.add_argument( "--n-startup-trials", type=int, default=5, ) parser.add_argument( "--n-warmup-steps", type=int, default=5 * _tmp_args.eval_interval, help=n_warmup_steps_help_msg, ) elif _tmp_args.optuna_pruner == "HyperbandPruner": pass args = parser.parse_args() rootdir = experiments.prepare_output_dir(args=args, basedir=args.outdir) file_handler = logging.FileHandler(filename=os.path.join(rootdir, "console.log")) console_handler = logging.StreamHandler() logging.basicConfig(level=logging.INFO, handlers=[file_handler, console_handler]) randomizer = random.Random(args.seed) def objective(trial): # suggest parameters from Optuna hyperparams = suggest(trial, args.steps) # seed is generated for each objective seed = randomizer.randint(0, 2**31 - 1) additional_args = dict(seed=seed, **hyperparams) outdir = experiments.prepare_output_dir(args=additional_args, basedir=rootdir) print("Output files are saved in {}".format(outdir)) return _objective_core( # optuna parameters trial=trial, # training parameters env_id=args.env, outdir=outdir, seed=seed, monitor=args.monitor, gpu=args.gpu, steps=args.steps, train_max_episode_len=args.train_max_episode_len, eval_n_episodes=args.eval_n_episodes, eval_interval=args.eval_interval, batch_size=args.batch_size, # hyperparameters hyperparams=hyperparams, ) sampler = optuna.samplers.TPESampler(seed=args.seed) # pruner if args.optuna_pruner == "NopPruner": pruner = optuna.pruners.NopPruner() elif args.optuna_pruner == "ThresholdPruner": pruner = optuna.pruners.ThresholdPruner( lower=args.lower, n_warmup_steps=args.n_warmup_steps, ) elif args.optuna_pruner == "PercentilePruner": pruner = optuna.pruners.PercentilePruner( percentile=args.percentile, n_startup_trials=args.n_startup_trials, n_warmup_steps=args.n_warmup_steps, ) elif args.optuna_pruner == "HyperbandPruner": pruner = optuna.pruners.HyperbandPruner(min_resource=args.eval_interval) study = optuna.load_study( study_name=args.optuna_study_name, storage=args.optuna_storage, sampler=sampler, pruner=pruner, ) class OptunaTrainingStepsBudgetCallback: def __init__(self, training_steps_budget, logger=None): self.training_steps_budget = training_steps_budget self.logger = logger or logging.getLogger(__name__) def __call__(self, study, trial): training_steps = sum( trial.last_step for trial in study.get_trials() if trial.last_step is not None ) self.logger.info( "{} / {} (sum of training steps / budget)".format( training_steps, self.training_steps_budget ) ) if training_steps >= self.training_steps_budget: study.stop() callbacks = [ OptunaTrainingStepsBudgetCallback( training_steps_budget=args.optuna_training_steps_budget, ), ] study.optimize(objective, callbacks=callbacks)
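# Illustrative sketch of how a trial interacts with the pruners configured
# above: the objective reports intermediate evaluation scores at given training
# steps, and raises optuna.TrialPruned when the pruner decides to stop the
# trial early. Minimal in-memory study with a toy objective; the real script
# loads a study from the SQLite DB passed via --optuna-storage, and its
# PercentilePruner at 50.0 is equivalent to the MedianPruner used here.
import optuna

def toy_objective(trial):
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    score = 0.0
    for step in range(1, 6):
        score += lr * 100            # fake "evaluation score"
        trial.report(score, step=step)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return score

study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(toy_objective, n_trials=10)
print(study.best_params)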
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4", help="Gym Env ID.") parser.add_argument("--gpu", type=int, default=0, help="GPU device ID. Set to -1 to use CPUs only.") parser.add_argument( "--num-envs", type=int, default=8, help="Number of env instances run in parallel.", ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--steps", type=int, default=10**7, help="Total time steps for training.") parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate.") parser.add_argument( "--eval-interval", type=int, default=100000, help="Interval (in timesteps) between evaluation phases.", ) parser.add_argument( "--eval-n-runs", type=int, default=10, help="Number of episodes ran in an evaluation phase.", ) parser.add_argument( "--demo", action="store_true", default=False, help="Run demo episodes, not training.", ) parser.add_argument( "--load", type=str, default="", help=("Directory path to load a saved agent data from" " if it is a non-empty string."), ) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument( "--update-interval", type=int, default=128 * 8, help="Interval (in timesteps) between PPO iterations.", ) parser.add_argument( "--batchsize", type=int, default=32 * 8, help="Size of minibatch (in timesteps).", ) parser.add_argument( "--epochs", type=int, default=4, help="Number of epochs used for each PPO iteration.", ) parser.add_argument( "--log-interval", type=int, default=10000, help="Interval (in timesteps) of printing logs.", ) parser.add_argument( "--recurrent", action="store_true", default=False, help="Use a recurrent model. See the code for the model definition.", ) parser.add_argument( "--flicker", action="store_true", default=False, help=("Use so-called flickering Atari, where each" " screen is blacked out with probability 0.5."), ) parser.add_argument( "--no-frame-stack", action="store_true", default=False, help= ("Disable frame stacking so that the agent can only see the current screen." ), ) parser.add_argument( "--checkpoint-frequency", type=int, default=None, help="Frequency at which agents are stored.", ) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, flicker=args.flicker, frame_stack=False, ) env.seed(env_seed) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env def make_batch_env(test): # Bind idx as a default argument so each subprocess env gets its own index and seed vec_env = pfrl.envs.MultiprocessVectorEnv([ (lambda idx=idx: make_env(idx, test)) for idx in range(args.num_envs) ]) if not args.no_frame_stack: vec_env = pfrl.wrappers.VectorFrameStack(vec_env, 4) return vec_env sample_env = make_batch_env(test=False) print("Observation space", sample_env.observation_space) print("Action space", sample_env.action_space) n_actions = sample_env.action_space.n obs_n_channels = sample_env.observation_space.low.shape[0] del sample_env def lecun_init(layer, gain=1): if isinstance(layer, (nn.Conv2d, nn.Linear)): pfrl.initializers.init_lecun_normal(layer.weight, gain) nn.init.zeros_(layer.bias) else: pfrl.initializers.init_lecun_normal(layer.weight_ih_l0, gain) pfrl.initializers.init_lecun_normal(layer.weight_hh_l0, gain) nn.init.zeros_(layer.bias_ih_l0) nn.init.zeros_(layer.bias_hh_l0) return layer if args.recurrent: model = pfrl.nn.RecurrentSequential( lecun_init(nn.Conv2d(obs_n_channels, 32, 8, stride=4)), nn.ReLU(), lecun_init(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(), lecun_init(nn.Conv2d(64, 64, 3, stride=1)), nn.ReLU(), nn.Flatten(), lecun_init(nn.Linear(3136, 512)), nn.ReLU(), lecun_init(nn.GRU(num_layers=1, input_size=512, hidden_size=512)), pfrl.nn.Branched( nn.Sequential( lecun_init(nn.Linear(512, n_actions), 1e-2), SoftmaxCategoricalHead(), ), lecun_init(nn.Linear(512, 1)), ), ) else: model = nn.Sequential( lecun_init(nn.Conv2d(obs_n_channels, 32, 8, stride=4)), nn.ReLU(), lecun_init(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(), lecun_init(nn.Conv2d(64, 64, 3, stride=1)), nn.ReLU(), nn.Flatten(), lecun_init(nn.Linear(3136, 512)), nn.ReLU(), pfrl.nn.Branched( nn.Sequential( lecun_init(nn.Linear(512, n_actions), 1e-2), SoftmaxCategoricalHead(), ), lecun_init(nn.Linear(512, 1)), ), ) opt = torch.optim.Adam(model.parameters(), lr=args.lr, eps=1e-5) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=True, entropy_coef=1e-2, recurrent=args.recurrent, max_grad_norm=0.5, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=make_batch_env(test=True), agent=agent, n_steps=None, n_episodes=args.eval_n_runs, ) print("n_runs: {} mean: {} median: {} stdev: {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: step_hooks = [] # Linearly decay the learning rate to zero def lr_setter(env, agent, value): for param_group in agent.optimizer.param_groups: param_group["lr"] = value step_hooks.append( experiments.LinearInterpolationHook(args.steps, args.lr, 0,
lr_setter)) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, checkpoint_freq=args.checkpoint_frequency, eval_interval=args.eval_interval, log_interval=args.log_interval, save_best_so_far_agent=False, step_hooks=step_hooks, )
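# --- Illustrative sketch (not part of the training script above) ---
# A minimal sketch of the learning-rate schedule the LinearInterpolationHook is
# expected to apply, assuming it interpolates linearly from args.lr down to 0
# over args.steps environment steps.  This reproduces only the schedule, not the
# PFRL hook's internals.
def linearly_decayed_lr(step, total_steps, start_lr, end_lr=0.0):
    """Return the linearly interpolated learning rate at a given step."""
    frac = min(max(step / total_steps, 0.0), 1.0)
    return start_lr + frac * (end_lr - start_lr)

# Example: with start_lr=2.5e-4 and total_steps=1e7, the lr is halved at 5e6 steps.
assert abs(linearly_decayed_lr(5 * 10**6, 10**7, 2.5e-4) - 1.25e-4) < 1e-12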
def main(): import logging logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--env", type=str, default="CartPole-v1") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--final-exploration-steps", type=int, default=1000) parser.add_argument("--start-epsilon", type=float, default=1.0) parser.add_argument("--end-epsilon", type=float, default=0.1) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--steps", type=int, default=10**8) parser.add_argument("--prioritized-replay", action="store_true") parser.add_argument("--replay-start-size", type=int, default=50) parser.add_argument("--target-update-interval", type=int, default=100) parser.add_argument("--target-update-method", type=str, default="hard") parser.add_argument("--soft-update-tau", type=float, default=1e-2) parser.add_argument("--update-interval", type=int, default=1) parser.add_argument("--eval-n-runs", type=int, default=100) parser.add_argument("--eval-interval", type=int, default=1000) parser.add_argument("--n-hidden-channels", type=int, default=12) parser.add_argument("--n-hidden-layers", type=int, default=3) parser.add_argument("--gamma", type=float, default=0.95) parser.add_argument("--minibatch-size", type=int, default=None) parser.add_argument("--render-train", action="store_true") parser.add_argument("--render-eval", action="store_true") parser.add_argument("--monitor", action="store_true") parser.add_argument("--reward-scale-factor", type=float, default=1.0) args = parser.parse_args() # Set a random seed used in PFRL utils.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv) print("Output files are saved in {}".format(args.outdir)) def make_env(test): env = gym.make(args.env) env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = pfrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor) if (args.render_eval and test) or (args.render_train and not test): env = pfrl.wrappers.Render(env) return env env = make_env(test=False) timestep_limit = env.spec.max_episode_steps obs_size = env.observation_space.low.size action_space = env.action_space n_atoms = 51 v_max = 500 v_min = 0 n_actions = action_space.n q_func = q_functions.DistributionalFCStateQFunctionWithDiscreteAction( obs_size, n_actions, n_atoms, v_min, v_max, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, ) # Use epsilon-greedy for exploration explorer = explorers.LinearDecayEpsilonGreedy( args.start_epsilon, args.end_epsilon, args.final_exploration_steps, action_space.sample, ) opt = torch.optim.Adam(q_func.parameters(), 1e-3) rbuf_capacity = 50000 # 5 * 10 ** 5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: betasteps = (args.steps - args.replay_start_size) // args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps) else: rbuf = 
replay_buffers.ReplayBuffer(rbuf_capacity) agent = pfrl.agents.CategoricalDQN( q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, minibatch_size=args.minibatch_size, target_update_method=args.target_update_method, soft_update_tau=args.soft_update_tau, ) if args.load: agent.load(args.load) eval_env = make_env(test=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, train_max_episode_len=timestep_limit, )
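# --- Illustrative sketch (not part of the training script above) ---
# The categorical (C51-style) Q-function above uses a fixed return support of
# n_atoms=51 values spaced evenly between v_min=0 and v_max=500; the expected
# Q-value is the probability-weighted sum over that support.  The uniform
# distribution below is dummy data for illustration only.
import numpy as np

n_atoms, v_min, v_max = 51, 0.0, 500.0
z = np.linspace(v_min, v_max, n_atoms)      # atom values z_0 .. z_50
probs = np.full(n_atoms, 1.0 / n_atoms)     # e.g. a uniform return distribution
q_value = float(np.sum(probs * z))          # expected return under that distribution
print(z[0], z[-1], q_value)                 # 0.0 500.0 250.0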
def main(): args = parse_rl_args() logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_ant_env(idx, test): # use different seeds for train vs test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed # env_seed = np.random.randint(0, 2**32 - 1) if not test else process_seed print(env_seed) utils.set_random_seed(env_seed) # create the anv environment with goal env = AntEnvWithGoal(create_maze_env(args.env), args.env, env_subgoal_dim=15) env.seed(int(env_seed)) if args.render: env = pfrl.wrappers.GymLikeEnvRender(env, mode='human') return env def make_batch_ant__env(test): return pfrl.envs.MultiprocessVectorEnv([ functools.partial(make_ant_env, idx, test) for idx in range(args.num_envs) ]) eval_env = make_ant_env(0, test=True) env_state_dim = eval_env.state_dim env_action_dim = eval_env.action_dim env_subgoal_dim = eval_env.subgoal_dim # determined from the ant env if args.env == 'AntMaze' or args.env == 'AntPush': env_goal_dim = 2 else: env_goal_dim = 3 action_space = eval_env.action_space subgoal_space = eval_env.subgoal_space scale_low = action_space.high * np.ones(env_action_dim) scale_high = subgoal_space.high * np.ones(env_subgoal_dim) def low_level_burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(action_space.low, action_space.high).astype(np.float32) def high_level_burnin_action_func(): """Select random actions until model is updated one or more times.""" return np.random.uniform(subgoal_space.low, subgoal_space.high).astype(np.float32) gpu = 0 if torch.cuda.is_available() else None agent = HIROAgent( state_dim=env_state_dim, action_dim=env_action_dim, goal_dim=env_goal_dim, subgoal_dim=env_subgoal_dim, high_level_burnin_action_func=high_level_burnin_action_func, low_level_burnin_action_func=low_level_burnin_action_func, scale_low=scale_low, scale_high=scale_high, buffer_size=200000, subgoal_freq=10, train_freq=10, reward_scaling=0.1, goal_threshold=5, gpu=gpu, add_entropy_layer=args.add_entropy_layer, soft_subgoal_update=args.soft_subgoal_update, temperature_high=args.temperature_high, temperature_low=args.temperature_low, optimize_high_temp=args.optimize_high_temp, optimize_low_temp=args.optimize_low_temp) if args.load: # load weights from a file if arg supplied agent.load(args.load) if args.record: from mujoco_py import GlfwContext GlfwContext(offscreen=True) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, video_outdir=args.outdir, step_number=-1 if args.record else None # justNonNoneObjectForRecording ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: # train the hierarchical agent experiments.train_hrl_agent_with_evaluation(agent=agent, env=make_ant_env( 0, test=False), steps=args.steps, outdir=args.outdir, eval_n_steps=None, eval_interval=5000, eval_n_episodes=10, use_tensorboard=True, record=args.record)
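# --- Illustrative sketch (not part of the training script above) ---
# In HIRO-style hierarchies, between high-level decisions the subgoal is
# re-expressed relative to the new state, g_{t+1} = s_t + g_t - s_{t+1}, applied
# to the first subgoal_dim state dimensions.  This follows the HIRO paper;
# whether HIROAgent implements it exactly this way internally is an assumption.
import numpy as np

def subgoal_transition(state, subgoal, next_state):
    dim = subgoal.shape[0]
    return state[:dim] + subgoal - next_state[:dim]

s = np.zeros(30, dtype=np.float32)            # dummy current state
g = np.ones(15, dtype=np.float32)             # dummy subgoal (subgoal_dim=15)
s_next = 0.25 * np.ones(30, dtype=np.float32)  # dummy next state
print(subgoal_transition(s, g, s_next))       # 0.75 in every subgoal dimension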
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4") parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.1) parser.add_argument("--eval-epsilon", type=float, default=0.05) parser.add_argument("--steps", type=int, default=10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) parser.add_argument("--target-update-interval", type=int, default=10**4) parser.add_argument("--eval-interval", type=int, default=10**5) parser.add_argument("--update-interval", type=int, default=4) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--batch-size", type=int, default=32) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n n_atoms = 51 v_max = 10 v_min = -10 q_func = torch.nn.Sequential( pfrl.nn.LargeAtariCNN(), pfrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction( 512, n_actions, n_atoms, v_min, v_max, n_hidden_channels=0, n_hidden_layers=0, ), ) # Use the same hyper parameters as https://arxiv.org/abs/1707.06887 opt = torch.optim.Adam(q_func.parameters(), 2.5e-4, eps=1e-2 / args.batch_size) rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = pfrl.agents.CategoricalDQN( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, batch_accumulator="mean", phi=phi, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print("n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=False, eval_env=eval_env, )
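# --- Illustrative sketch (not part of the training script above) ---
# The epsilon schedule implied by LinearDecayEpsilonGreedy above, assuming
# epsilon decays linearly from 1.0 to args.final_epsilon over
# args.final_exploration_frames steps and stays constant afterwards.  This
# illustrates the schedule only, not the PFRL explorer class itself.
def epsilon_at(step, start_eps=1.0, final_eps=0.1, decay_steps=10**6):
    frac = min(step / decay_steps, 1.0)
    return start_eps + frac * (final_eps - start_eps)

print(epsilon_at(0), epsilon_at(5 * 10**5), epsilon_at(2 * 10**6))  # 1.0 0.55 0.1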
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--outdir", type=str, default="results", help=( "Directory path to save output files." " If it does not exist, it will be created." ), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument( "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU." ) parser.add_argument( "--demo", action="store_true", default=False, help="Evaluate the agent without training.", ) parser.add_argument( "--load", type=str, default=None, help="Load a saved agent from a given directory.", ) parser.add_argument( "--final-exploration-steps", type=int, default=5 * 10 ** 5, help="Timesteps after which we stop annealing exploration rate", ) parser.add_argument( "--final-epsilon", type=float, default=0.2, help="Final value of epsilon during training.", ) parser.add_argument( "--steps", type=int, default=2 * 10 ** 6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--replay-start-size", type=int, default=5 * 10 ** 4, help="Minimum replay buffer size before performing gradient updates.", ) parser.add_argument( "--target-update-interval", type=int, default=1 * 10 ** 4, help="Frequency (in timesteps) at which the target network is updated.", ) parser.add_argument( "--eval-interval", type=int, default=10 ** 5, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( "--update-interval", type=int, default=1, help="Frequency (in timesteps) of network updates.", ) parser.add_argument( "--eval-n-runs", type=int, default=100, help="Number of episodes used for evaluation.", ) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate") parser.add_argument( "--num-envs", type=int, default=1, help="Number of envs run in parallel." ) parser.add_argument( "--batch-size", type=int, default=32, help="Batch size used for training." ) parser.add_argument( "--record", action="store_true", default=False, help="Record videos of evaluation envs. --render should also be specified.", ) parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor.") args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2 ** 32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) max_episode_steps = 8 def make_env(idx, test): from pybullet_envs.bullet.kuka_diverse_object_gym_env import ( KukaDiverseObjectEnv, ) # NOQA # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2 ** 32 - 1 - process_seed if test else process_seed # Set a random seed for this subprocess utils.set_random_seed(env_seed) env = KukaDiverseObjectEnv( isDiscrete=True, renders=args.render and (args.demo or not test), height=84, width=84, maxSteps=max_episode_steps, isTest=test, ) # Disable file caching to keep memory usage small env._p.setPhysicsEngineParameter(enableFileCaching=False) assert env.observation_space is None env.observation_space = gym.spaces.Box( low=0, high=255, shape=(84, 84, 3), dtype=np.uint8 ) # (84, 84, 3) -> (3, 84, 84) env = TransposeObservation(env, (2, 0, 1)) env = ObserveElapsedSteps(env, max_episode_steps) # KukaDiverseObjectEnv internally asserts int actions env = CastAction(env, int) env.seed(int(env_seed)) if test and args.record: assert args.render, "To use --record, --render needs be specified." video_dir = os.path.join(args.outdir, "video_{}".format(idx)) os.mkdir(video_dir) env = RecordMovie(env, video_dir) return env def make_batch_env(test): return pfrl.envs.MultiprocessVectorEnv( [functools.partial(make_env, idx, test) for idx in range(args.num_envs)] ) eval_env = make_batch_env(test=True) n_actions = eval_env.action_space.n q_func = GraspingQFunction(n_actions, max_episode_steps) # Use the hyper parameters of the Nature paper opt = pfrl.optimizers.RMSpropEpsInsideSqrt( q_func.parameters(), lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2, centered=True, ) # Anneal beta from beta0 to 1 throughout training betasteps = args.steps / args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( 10 ** 6, alpha=0.6, beta0=0.4, betasteps=betasteps ) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_steps, lambda: np.random.randint(n_actions), ) def phi(x): # Feature extractor image, elapsed_steps = x # Normalize RGB values: [0, 255] -> [0, 1] norm_image = np.asarray(image, dtype=np.float32) / 255 return norm_image, elapsed_steps agent = pfrl.agents.DoubleDQN( q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, minibatch_size=args.batch_size, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, batch_accumulator="sum", phi=phi, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs ) print( "n_runs: {} mean: {} median: {} stdev {}".format( args.eval_n_runs, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], ) ) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(test=False), eval_env=eval_env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=False, log_interval=1000, )
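# --- Illustrative sketch (not part of the training script above) ---
# The prioritized replay buffer above is configured with beta0=0.4 and
# betasteps = args.steps / args.update_interval.  A common schedule, assumed
# here for illustration, anneals the importance-sampling exponent beta linearly
# from beta0 to 1 over betasteps updates; this is not the buffer's internal code.
def beta_at(update, beta0=0.4, betasteps=2 * 10**6):
    return min(1.0, beta0 + (1.0 - beta0) * update / betasteps)

print(beta_at(0), beta_at(10**6), beta_at(3 * 10**6))  # 0.4 0.7 1.0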
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env", type=str, default="SlimeVolley-v0") parser.add_argument( "--outdir", type=str, default="results", help=( "Directory path to save output files." " If it does not exist, it will be created." ), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--noisy-net-sigma", type=float, default=0.1) parser.add_argument("--steps", type=int, default=2 * 10 ** 6) parser.add_argument("--replay-start-size", type=int, default=1600) parser.add_argument("--eval-n-episodes", type=int, default=1000) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help=( "Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--gamma", type=float, default=0.98) parser.add_argument("--v-max", type=float, default=1) parser.add_argument("--n-step-return", type=int, default=3) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. train_seed = args.seed test_seed = 2 ** 31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): if "SlimeVolley" in args.env: # You need to install slimevolleygym import slimevolleygym # NOQA env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env.seed(int(env_seed)) if args.monitor: env = pfrl.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training" ) if args.render: env = pfrl.wrappers.Render(env) if isinstance(env.action_space, gym.spaces.MultiBinary): env = MultiBinaryAsDiscreteAction(env) return env env = make_env(test=False) eval_env = make_env(test=True) obs_size = env.observation_space.low.size n_actions = env.action_space.n n_atoms = 51 v_max = args.v_max v_min = -args.v_max hidden_size = 512 q_func = nn.Sequential( nn.Linear(obs_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, hidden_size), nn.ReLU(), DistributionalDuelingHead(hidden_size, n_actions, n_atoms, v_min, v_max), ) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) # Noisy nets pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma) # Turn off explorer explorer = explorers.Greedy() # Use the same eps as https://arxiv.org/abs/1710.02298 opt = torch.optim.Adam(q_func.parameters(), 1e-4, eps=1.5e-4) # Prioritized Replay # Anneal beta from beta0 to 1 throughout training update_interval = 1 betasteps = args.steps / update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( 10 ** 6, alpha=0.5, beta0=0.4, betasteps=betasteps, num_steps=args.n_step_return, normalize_by_max="memory", ) agent = agents.CategoricalDoubleDQN( q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma, explorer=explorer, minibatch_size=32, replay_start_size=args.replay_start_size, target_update_interval=2000, update_interval=update_interval, 
batch_accumulator="mean", phi=phi, max_grad_norm=10, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_episodes, ) print( "n_episodes: {} mean: {} median: {} stdev {}".format( eval_stats["episodes"], eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], ) ) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_episodes, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=True, eval_env=eval_env, )
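# --- Illustrative sketch (not part of the training script above) ---
# SlimeVolley exposes a MultiBinary action space, which the
# MultiBinaryAsDiscreteAction wrapper above presents as a Discrete space.  One
# way to do that, assumed here purely for illustration, is to decode each
# discrete index into its binary digits; the wrapper's actual mapping may differ.
import numpy as np

def discrete_to_multibinary(index, n):
    return np.array([(index >> bit) & 1 for bit in range(n)], dtype=np.int8)

print(discrete_to_multibinary(5, 3))  # [1 0 1] -> two of the three buttons pressed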
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--env", type=str, default="BreakoutNoFrameskip-v4", help="OpenAI Atari domain to perform algorithm on.", ) parser.add_argument( "--outdir", type=str, default="results", help=("Directory path to save output files." " If it does not exist, it will be created."), ) parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)") parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.") parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument( "--final-exploration-frames", type=int, default=10**6, help="Timesteps after which we stop " + "annealing exploration rate", ) parser.add_argument( "--final-epsilon", type=float, default=0.01, help="Final value of epsilon during training.", ) parser.add_argument( "--eval-epsilon", type=float, default=0.001, help="Exploration epsilon used during eval episodes.", ) parser.add_argument( "--steps", type=int, default=5 * 10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) parser.add_argument( "--replay-start-size", type=int, default=5 * 10**4, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--target-update-interval", type=int, default=3 * 10**4, help="Frequency (in timesteps) at which " + "the target network is updated.", ) parser.add_argument("--demo-n-episodes", type=int, default=30) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument( "--eval-interval", type=int, default=250000, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( "--update-interval", type=int, default=4, help="Frequency (in timesteps) of network updates.", ) parser.add_argument( "--log-level", type=int, default=20, help="Logging level. 10:DEBUG, 20:INFO etc.", ) parser.add_argument( "--render", action="store_true", default=False, help="Render env states in a GUI window.", ) parser.add_argument( "--monitor", action="store_true", default=False, help= ("Monitor env. Videos and additional information are saved as output files." ), ) parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate.") parser.add_argument( "--recurrent", action="store_true", default=False, help="Use a recurrent model. See the code for the model definition.", ) parser.add_argument( "--flicker", action="store_true", default=False, help=("Use so-called flickering Atari, where each" " screen is blacked out with probability 0.5."), ) parser.add_argument( "--no-frame-stack", action="store_true", default=False, help= ("Disable frame stacking so that the agent can only see the current screen." ), ) parser.add_argument( "--episodic-update-len", type=int, default=10, help="Maximum length of sequences for updating recurrent models", ) parser.add_argument( "--batch-size", type=int, default=32, help=("Number of transitions (in a non-recurrent case)" " or sequences (in a recurrent case) used for an" " update."), ) args = parser.parse_args() import logging logging.basicConfig(level=args.log_level) # Set a random seed used in PFRL. utils.set_random_seed(args.seed) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, flicker=args.flicker, frame_stack=not args.no_frame_stack, ) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode="evaluation" if test else "training") if args.render: env = pfrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) print("Observation space", env.observation_space) print("Action space", env.action_space) n_frames = env.observation_space.shape[0] n_actions = env.action_space.n if args.recurrent: # Q-network with LSTM q_func = pfrl.nn.RecurrentSequential( nn.Conv2d(n_frames, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(), nn.Conv2d(64, 64, 3, stride=1), nn.Flatten(), nn.ReLU(), nn.LSTM(input_size=3136, hidden_size=512), nn.Linear(512, n_actions), DiscreteActionValueHead(), ) # Replay buffer that stores whole episodes rbuf = replay_buffers.EpisodicReplayBuffer(10**6) else: # Q-network without LSTM q_func = nn.Sequential( nn.Conv2d(n_frames, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(), nn.Conv2d(64, 64, 3, stride=1), nn.Flatten(), nn.Linear(3136, 512), nn.ReLU(), nn.Linear(512, n_actions), DiscreteActionValueHead(), ) # Replay buffer that stores transitions separately rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, args.final_epsilon, args.final_exploration_frames, lambda: np.random.randint(n_actions), ) opt = torch.optim.Adam(q_func.parameters(), lr=1e-4, eps=1e-4) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = pfrl.agents.DoubleDQN( q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, target_update_interval=args.target_update_interval, update_interval=args.update_interval, batch_accumulator="mean", phi=phi, minibatch_size=args.batch_size, episodic_update_len=args.episodic_update_len, recurrent=args.recurrent, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=eval_env, agent=agent, n_steps=None, n_episodes=args.demo_n_episodes, ) print("n_runs: {} mean: {} median: {} stdev {}".format( args.demo_n_episodes, eval_stats["mean"], eval_stats["median"], eval_stats["stdev"], )) else: experiments.train_agent_with_evaluation( agent=agent, env=env, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, outdir=args.outdir, eval_env=eval_env, )
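# --- Illustrative sketch (not part of the training script above) ---
# The --flicker option above enables "flickering Atari": each screen is blacked
# out with probability 0.5, making the task partially observable and motivating
# the recurrent (LSTM) model.  The function below illustrates the concept only;
# it is not the wrapper used by atari_wrappers.
import numpy as np

def flicker(frame, rng, p=0.5):
    return np.zeros_like(frame) if rng.random() < p else frame

rng = np.random.default_rng(0)
obs = np.ones((84, 84), dtype=np.uint8)  # dummy observation
print([int(flicker(obs, rng).max()) for _ in range(5)])  # mix of blacked-out and kept frames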