def test_bias_correction_softmax(n_actions):
    base_policy = nn.Sequential(
        nn.Linear(1, n_actions),
        SoftmaxCategoricalHead(),
    )
    another_policy = nn.Sequential(
        nn.Linear(1, n_actions),
        SoftmaxCategoricalHead(),
    )
    q_values = torch.rand(1, n_actions)
    action_value = pfrl.action_value.DiscreteActionValue(q_values)
    _test_bias_correction(base_policy, another_policy, action_value)
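For reference, a minimal sketch of what SoftmaxCategoricalHead does in the snippets above and below: it maps raw logits to a torch.distributions.Categorical, so a policy's output supports sample(), log_prob(), and entropy(). Layer sizes here are arbitrary illustration values.

import torch
from torch import nn
from pfrl.policies import SoftmaxCategoricalHead

policy = nn.Sequential(nn.Linear(4, 3), SoftmaxCategoricalHead())
dist = policy(torch.rand(1, 4))   # torch.distributions.Categorical over 3 actions
action = dist.sample()            # sampled action indices, shape (1,)
log_prob = dist.log_prob(action)  # log pi(a|s), used by policy-gradient losses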
def make_model(self, env):
    hidden_size = 50
    obs_size = env.observation_space.low.size
    v = nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.Tanh(),
        nn.Linear(hidden_size, 1),
    )
    if self.discrete:
        n_actions = env.action_space.n
        pi = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, n_actions),
            SoftmaxCategoricalHead(),
        )
    else:
        action_size = env.action_space.low.size
        pi = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, action_size),
            GaussianHeadWithStateIndependentCovariance(
                action_size=action_size,
                var_type="diagonal",
                var_func=lambda x: torch.exp(2 * x),
                var_param_init=0,
            ),
        )
    return Branched(pi, v)
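A short usage sketch for the Branched(pi, v) model built above: pfrl.nn.Branched applies each child module to the same input and returns the results as a tuple, here (action distribution, state value). The sizes are illustration values, not taken from any particular env.

import torch
from torch import nn
import pfrl
from pfrl.policies import SoftmaxCategoricalHead

obs_size, n_actions, hidden_size = 4, 2, 50  # illustration values
pi = nn.Sequential(
    nn.Linear(obs_size, hidden_size), nn.Tanh(),
    nn.Linear(hidden_size, n_actions), SoftmaxCategoricalHead(),
)
v = nn.Sequential(
    nn.Linear(obs_size, hidden_size), nn.Tanh(),
    nn.Linear(hidden_size, 1),
)
model = pfrl.nn.Branched(pi, v)
# Both heads see the same observation; the result is a tuple.
action_dist, value = model(torch.rand(1, obs_size))  # (Categorical, tensor of shape (1, 1))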
def test_compute_loss_with_kl_constraint_softmax():
    n_actions = 3
    policy = nn.Sequential(
        nn.Linear(1, n_actions),
        SoftmaxCategoricalHead(),
    )
    _test_compute_loss_with_kl_constraint(policy)
def __init__(self, n_input, n_action):
    super().__init__()
    self.n_input = n_input
    self.n_action = n_action
    # Network definition: separate bodies for policy and value
    self.body_p = self.body()
    self.body_v = self.body()
    out_size = self._get_conv_out()
    # Last layers
    self.policy = nn.Linear(out_size, self.n_action, bias=False)
    self.value = nn.Linear(out_size, 1, bias=False)
    # Softmax head: turns the policy logits into a categorical action distribution
    self.softmax = SoftmaxCategoricalHead()
    # Initialize last layers
    self.policy.weight.data = normalized_columns_initializer(
        self.policy.weight.data, 0.01)
    self.value.weight.data = normalized_columns_initializer(
        self.value.weight.data, 1.0)
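normalized_columns_initializer is defined elsewhere in this source. A typical definition, as found in common PyTorch A3C ports (an assumption about this file, not a verbatim copy), rescales random weights so every output unit's weight vector has a fixed norm:

import torch

def normalized_columns_initializer(weights, std=1.0):
    # Hypothetical but conventional definition: draw random weights, then
    # rescale so each output unit's weight vector has norm `std`.
    out = torch.randn(weights.size())
    out *= std / torch.sqrt(out.pow(2).sum(1, keepdim=True))
    return out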
def test_load_a3c(self):
    from pfrl.policies import SoftmaxCategoricalHead

    obs_size = 4
    n_actions = 4
    a3c_model = nn.Sequential(
        nn.Conv2d(obs_size, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
        pfrl.nn.Branched(
            nn.Sequential(
                nn.Linear(256, n_actions),
                SoftmaxCategoricalHead(),
            ),
            nn.Linear(256, 1),
        ),
    )
    from pfrl.optimizers import SharedRMSpropEpsInsideSqrt

    opt = SharedRMSpropEpsInsideSqrt(
        a3c_model.parameters(), lr=7e-4, eps=1e-1, alpha=0.99)
    agent = agents.A3C(
        a3c_model, opt, t_max=5, gamma=0.99, beta=1e-2, phi=lambda x: x)
    downloaded_model, exists = download_model(
        "A3C", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type)
    agent.load(downloaded_model)
    if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
        assert exists
def make_model(self, env):
    hidden_size = 20
    obs_size = env.observation_space.low.size

    def weight_scale(layer, scale):
        with torch.no_grad():
            layer.weight.mul_(scale)
        return layer

    if self.recurrent:
        v = RecurrentSequential(
            nn.LSTM(num_layers=1, input_size=obs_size, hidden_size=hidden_size),
            weight_scale(nn.Linear(hidden_size, 1), 1e-1),
        )
        if self.discrete:
            n_actions = env.action_space.n
            pi = RecurrentSequential(
                nn.LSTM(num_layers=1, input_size=obs_size, hidden_size=hidden_size),
                weight_scale(nn.Linear(hidden_size, n_actions), 1e-1),
                SoftmaxCategoricalHead(),
            )
        else:
            action_size = env.action_space.low.size
            pi = RecurrentSequential(
                nn.LSTM(num_layers=1, input_size=obs_size, hidden_size=hidden_size),
                weight_scale(nn.Linear(hidden_size, action_size), 1e-1),
                GaussianHeadWithStateIndependentCovariance(
                    action_size=action_size,
                    var_type="diagonal",
                    var_func=lambda x: torch.exp(2 * x),
                    var_param_init=0,
                ),
            )
        return RecurrentBranched(pi, v)
    else:
        v = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.Tanh(),
            weight_scale(nn.Linear(hidden_size, 1), 1e-1),
        )
        if self.discrete:
            n_actions = env.action_space.n
            pi = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.Tanh(),
                weight_scale(nn.Linear(hidden_size, n_actions), 1e-1),
                SoftmaxCategoricalHead(),
            )
        else:
            action_size = env.action_space.low.size
            pi = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.Tanh(),
                weight_scale(nn.Linear(hidden_size, action_size), 1e-1),
                GaussianHeadWithStateIndependentCovariance(
                    action_size=action_size,
                    var_type="diagonal",
                    var_func=lambda x: torch.exp(2 * x),
                    var_param_init=0,
                ),
            )
        return pfrl.nn.Branched(pi, v)
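A rough sketch of the recurrent interface these models follow (hedged; see pfrl.utils.recurrent for the exact API): a RecurrentSequential is called with an input batch plus a recurrent state and returns the output plus the next recurrent state, and pfrl.utils.recurrent.one_step_forward handles the packing for a single time step.

import torch
from torch import nn
from pfrl.nn import RecurrentSequential
from pfrl.policies import SoftmaxCategoricalHead
from pfrl.utils.recurrent import one_step_forward

obs_size, hidden_size, n_actions = 4, 20, 2  # illustration values
pi = RecurrentSequential(
    nn.LSTM(num_layers=1, input_size=obs_size, hidden_size=hidden_size),
    nn.Linear(hidden_size, n_actions),
    SoftmaxCategoricalHead(),
)
batch_obs = torch.rand(1, obs_size)
# None means "start from the initial recurrent state".
dist, next_state = one_step_forward(pi, batch_obs, None)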
def _test_abc(
    self, use_lstm, discrete=True, steps=1000000, require_success=True, gpu=-1
):
    def make_env(process_idx, test):
        size = 2
        return ABC(
            size=size,
            discrete=discrete,
            episodic=True,
            partially_observable=self.use_lstm,
            deterministic=test,
        )

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    hidden_size = 20
    obs_size = obs_space.low.size
    if discrete:
        output_size = action_space.n
        head = SoftmaxCategoricalHead()
    else:
        output_size = action_space.low.size
        head = GaussianHeadWithStateIndependentCovariance(
            output_size, var_type="diagonal"
        )
    if use_lstm:
        model = pfrl.nn.RecurrentSequential(
            nn.LSTM(
                num_layers=1,
                input_size=obs_size,
                hidden_size=hidden_size,
            ),
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, output_size),
            head,
        )
    else:
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, output_size),
            head,
        )
    opt = torch.optim.Adam(model.parameters())
    beta = 1e-2
    agent = pfrl.agents.REINFORCE(
        model,
        opt,
        gpu=gpu,
        beta=beta,
        batchsize=self.batchsize,
        backward_separately=self.backward_separately,
        act_deterministically=True,
        recurrent=use_lstm,
    )

    pfrl.experiments.train_agent_with_evaluation(
        agent=agent,
        env=make_env(0, False),
        eval_env=make_env(0, True),
        outdir=self.outdir,
        steps=steps,
        train_max_episode_len=2,
        eval_interval=500,
        eval_n_steps=None,
        eval_n_episodes=5,
        successful_score=1,
    )

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    eval_returns, _ = run_evaluation_episodes(
        env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
    )
    if require_success:
        successful_return = 1
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        assert n_succeeded == n_test_runs
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--processes", type=int, default=16)
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--t-max", type=int, default=5)
    parser.add_argument("--beta", type=float, default=1e-2)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--steps", type=int, default=8 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help="Monitor env. Videos and additional information are saved as output files.",
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    sample_env = make_env(0, False)
    obs_size = sample_env.observation_space.low.shape[0]
    n_actions = sample_env.action_space.n

    model = nn.Sequential(
        nn.Conv2d(obs_size, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
        pfrl.nn.Branched(
            nn.Sequential(
                nn.Linear(256, n_actions),
                SoftmaxCategoricalHead(),
            ),
            nn.Linear(256, 1),
        ),
    )

    # SharedRMSprop is the same as torch.optim.RMSprop except that it
    # initializes its state in __init__, allowing it to be moved to shared
    # memory.
    opt = SharedRMSpropEpsInsideSqrt(
        model.parameters(), lr=7e-4, eps=1e-1, alpha=0.99)
    assert opt.state_dict()["state"], (
        "To share optimizer state across processes, the state must be"
        " initialized before training.")

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(
        model,
        opt,
        t_max=args.t_max,
        gamma=0.99,
        beta=args.beta,
        phi=phi,
        max_grad_norm=40.0,
    )

    if args.load_pretrained:
        raise Exception("Pretrained models are currently unsupported.")

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None)
        print(
            "n_steps: {} mean: {} median: {} stdev: {}".format(
                args.eval_n_steps,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for pg in agent.optimizer.param_groups:
                assert "lr" in pg
                pg["lr"] = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=True,
        )
def test_ppo_dataset_recurrent_and_non_recurrent_equivalence(
    use_obs_normalizer, gamma, lambd, max_recurrent_sequence_len
):
    """Test equivalence between recurrent and non-recurrent datasets.

    When the same feed-forward model is used, the values of log_prob,
    v_pred, next_v_pred obtained by both recurrent and non-recurrent
    dataset creation functions should be the same.
    """
    episodes = make_random_episodes()
    if use_obs_normalizer:
        obs_normalizer = pfrl.nn.EmpiricalNormalization(2, clip_threshold=5)
        obs_normalizer.experience(torch.rand(10, 2))
    else:
        obs_normalizer = None

    def phi(obs):
        return (obs * 0.5).astype(np.float32)

    device = torch.device("cpu")

    obs_size = 2
    n_actions = 3

    non_recurrent_model = pfrl.nn.Branched(
        nn.Sequential(
            nn.Linear(obs_size, n_actions),
            SoftmaxCategoricalHead(),
        ),
        nn.Linear(obs_size, 1),
    )
    recurrent_model = RecurrentSequential(non_recurrent_model)

    dataset = pfrl.agents.ppo._make_dataset(
        episodes=copy.deepcopy(episodes),
        model=non_recurrent_model,
        phi=phi,
        batch_states=batch_states,
        obs_normalizer=obs_normalizer,
        gamma=gamma,
        lambd=lambd,
        device=device,
    )

    dataset_recurrent = pfrl.agents.ppo._make_dataset_recurrent(
        episodes=copy.deepcopy(episodes),
        model=recurrent_model,
        phi=phi,
        batch_states=batch_states,
        obs_normalizer=obs_normalizer,
        gamma=gamma,
        lambd=lambd,
        max_recurrent_sequence_len=max_recurrent_sequence_len,
        device=device,
    )

    assert "log_prob" not in episodes[0][0]
    assert "log_prob" in dataset[0]
    assert "log_prob" in dataset_recurrent[0][0]
    # They are not just shallow copies
    assert dataset[0]["log_prob"] is not dataset_recurrent[0][0]["log_prob"]

    states = [tr["state"] for tr in dataset]
    recurrent_states = [
        tr["state"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(states, recurrent_states)

    actions = [tr["action"] for tr in dataset]
    recurrent_actions = [
        tr["action"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(actions, recurrent_actions)

    rewards = [tr["reward"] for tr in dataset]
    recurrent_rewards = [
        tr["reward"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(rewards, recurrent_rewards)

    nonterminals = [tr["nonterminal"] for tr in dataset]
    recurrent_nonterminals = [
        tr["nonterminal"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(nonterminals, recurrent_nonterminals)

    log_probs = [tr["log_prob"] for tr in dataset]
    recurrent_log_probs = [
        tr["log_prob"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(log_probs, recurrent_log_probs)

    vs_pred = [tr["v_pred"] for tr in dataset]
    recurrent_vs_pred = [
        tr["v_pred"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(vs_pred, recurrent_vs_pred)

    next_vs_pred = [tr["next_v_pred"] for tr in dataset]
    recurrent_next_vs_pred = [
        tr["next_v_pred"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(next_vs_pred, recurrent_next_vs_pred)

    advs = [tr["adv"] for tr in dataset]
    recurrent_advs = [
        tr["adv"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(advs, recurrent_advs)

    vs_teacher = [tr["v_teacher"] for tr in dataset]
    recurrent_vs_teacher = [
        tr["v_teacher"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(vs_teacher, recurrent_vs_teacher)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("processes", type=int)
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--t-max", type=int, default=5)
    parser.add_argument("--replay-start-size", type=int, default=10000)
    parser.add_argument("--n-times-replay", type=int, default=4)
    parser.add_argument("--beta", type=float, default=1e-2)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--steps", type=int, default=10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--eval-interval", type=int, default=10**5)
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--use-lstm", action="store_true")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help="Monitor env. Videos and additional information are saved as output files.",
    )
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    input_to_hidden = nn.Sequential(
        nn.Conv2d(4, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
    )
    head = acer.ACERDiscreteActionHead(
        pi=nn.Sequential(
            nn.Linear(256, n_actions),
            SoftmaxCategoricalHead(),
        ),
        q=nn.Sequential(
            nn.Linear(256, n_actions),
            DiscreteActionValueHead(),
        ),
    )
    if args.use_lstm:
        model = pfrl.nn.RecurrentSequential(
            input_to_hidden,
            nn.LSTM(num_layers=1, input_size=256, hidden_size=256),
            head,
        )
    else:
        model = nn.Sequential(input_to_hidden, head)

    model.apply(pfrl.initializers.init_chainer_default)

    opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(
        model.parameters(), lr=args.lr, eps=4e-3, alpha=0.99)

    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(
        model,
        opt,
        t_max=args.t_max,
        gamma=0.99,
        replay_buffer=replay_buffer,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        beta=args.beta,
        phi=phi,
        max_grad_norm=40,
        recurrent=args.use_lstm,
    )
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs)
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for pg in agent.optimizer.param_groups:
                assert "lr" in pg
                pg["lr"] = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env", type=str, default="BreakoutNoFrameskip-v4", help="Gym Env ID."
    )
    parser.add_argument(
        "--gpu", type=int, default=0, help="GPU device ID. Set to -1 to use CPUs only."
    )
    parser.add_argument(
        "--num-envs",
        type=int,
        default=8,
        help="Number of env instances run in parallel.",
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument(
        "--steps", type=int, default=10**7, help="Total time steps for training."
    )
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate.")
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=100000,
        help="Interval (in timesteps) between evaluation phases.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes ran in an evaluation phase.",
    )
    parser.add_argument(
        "--demo",
        action="store_true",
        default=False,
        help="Run demo episodes, not training.",
    )
    parser.add_argument(
        "--load",
        type=str,
        default="",
        help=(
            "Directory path to load a saved agent data from"
            " if it is a non-empty string."
        ),
    )
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help="Monitor env. Videos and additional information are saved as output files.",
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=128 * 8,
        help="Interval (in timesteps) between PPO iterations.",
    )
    parser.add_argument(
        "--batchsize",
        type=int,
        default=32 * 8,
        help="Size of minibatch (in timesteps).",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=4,
        help="Number of epochs used for each PPO iteration.",
    )
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10000,
        help="Interval (in timesteps) of printing logs.",
    )
    parser.add_argument(
        "--recurrent",
        action="store_true",
        default=False,
        help="Use a recurrent model. See the code for the model definition.",
    )
    parser.add_argument(
        "--flicker",
        action="store_true",
        default=False,
        help=(
            "Use so-called flickering Atari, where each"
            " screen is blacked out with probability 0.5."
        ),
    )
    parser.add_argument(
        "--no-frame-stack",
        action="store_true",
        default=False,
        help=(
            "Disable frame stacking so that the agent can only see the current screen."
        ),
    )
    parser.add_argument(
        "--checkpoint-frequency",
        type=int,
        default=None,
        help="Frequency at which agents are stored.",
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(idx, test):
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=False,
        )
        env.seed(env_seed)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        # Bind idx eagerly with functools.partial; a plain lambda here would
        # late-bind idx and give every subprocess the same index and seed.
        vec_env = pfrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_env, idx, test) for idx in range(args.num_envs)]
        )
        if not args.no_frame_stack:
            vec_env = pfrl.wrappers.VectorFrameStack(vec_env, 4)
        return vec_env

    sample_env = make_batch_env(test=False)
    print("Observation space", sample_env.observation_space)
    print("Action space", sample_env.action_space)
    n_actions = sample_env.action_space.n
    obs_n_channels = sample_env.observation_space.low.shape[0]
    del sample_env

    def lecun_init(layer, gain=1):
        if isinstance(layer, (nn.Conv2d, nn.Linear)):
            pfrl.initializers.init_lecun_normal(layer.weight, gain)
            nn.init.zeros_(layer.bias)
        else:
            pfrl.initializers.init_lecun_normal(layer.weight_ih_l0, gain)
            pfrl.initializers.init_lecun_normal(layer.weight_hh_l0, gain)
            nn.init.zeros_(layer.bias_ih_l0)
            nn.init.zeros_(layer.bias_hh_l0)
        return layer

    if args.recurrent:
        model = pfrl.nn.RecurrentSequential(
            lecun_init(nn.Conv2d(obs_n_channels, 32, 8, stride=4)),
            nn.ReLU(),
            lecun_init(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            lecun_init(nn.Conv2d(64, 64, 3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            lecun_init(nn.Linear(3136, 512)),
            nn.ReLU(),
            lecun_init(nn.GRU(num_layers=1, input_size=512, hidden_size=512)),
            pfrl.nn.Branched(
                nn.Sequential(
                    lecun_init(nn.Linear(512, n_actions), 1e-2),
                    SoftmaxCategoricalHead(),
                ),
                lecun_init(nn.Linear(512, 1)),
            ),
        )
    else:
        model = nn.Sequential(
            lecun_init(nn.Conv2d(obs_n_channels, 32, 8, stride=4)),
            nn.ReLU(),
            lecun_init(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            lecun_init(nn.Conv2d(64, 64, 3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            lecun_init(nn.Linear(3136, 512)),
            nn.ReLU(),
            pfrl.nn.Branched(
                nn.Sequential(
                    lecun_init(nn.Linear(512, n_actions), 1e-2),
                    SoftmaxCategoricalHead(),
                ),
                lecun_init(nn.Linear(512, 1)),
            ),
        )

    opt = torch.optim.Adam(model.parameters(), lr=args.lr, eps=1e-5)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=True,
        entropy_coef=1e-2,
        recurrent=args.recurrent,
        max_grad_norm=0.5,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
        )
        print(
            "n_runs: {} mean: {} median: {} stdev: {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        step_hooks = []

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for param_group in agent.optimizer.param_groups:
                param_group["lr"] = value

        step_hooks.append(
            experiments.LinearInterpolationHook(args.steps, args.lr, 0, lr_setter)
        )

        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(False),
            eval_env=make_batch_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            checkpoint_freq=args.checkpoint_frequency,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            save_best_so_far_agent=False,
            step_hooks=step_hooks,
        )
def _test_abc(
    self,
    t_max,
    use_lstm,
    discrete=True,
    episodic=True,
    steps=100000,
    require_success=True,
):
    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(
            size=size,
            discrete=discrete,
            episodic=episodic or test,
            partially_observable=self.use_lstm,
            deterministic=test,
        )

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    replay_buffer = EpisodicReplayBuffer(10**4)
    obs_size = obs_space.low.size
    hidden_size = 20
    if discrete:
        n_actions = action_space.n
        head = acer.ACERDiscreteActionHead(
            pi=nn.Sequential(
                nn.Linear(hidden_size, n_actions),
                SoftmaxCategoricalHead(),
            ),
            q=nn.Sequential(
                nn.Linear(hidden_size, n_actions),
                DiscreteActionValueHead(),
            ),
        )
    else:
        action_size = action_space.low.size
        head = acer.ACERContinuousActionHead(
            pi=nn.Sequential(
                nn.Linear(hidden_size, action_size * 2),
                GaussianHeadWithDiagonalCovariance(),
            ),
            v=nn.Sequential(nn.Linear(hidden_size, 1)),
            adv=nn.Sequential(
                ConcatObsAndAction(),
                nn.Linear(hidden_size + action_size, 1),
            ),
        )
    if use_lstm:
        model = pfrl.nn.RecurrentSequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(),
            nn.LSTM(num_layers=1, input_size=hidden_size, hidden_size=hidden_size),
            head,
        )
    else:
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(),
            head,
        )
    eps = 1e-8
    opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(
        model.parameters(), lr=1e-3, eps=eps, alpha=0.99)
    gamma = 0.5
    beta = 1e-5
    if self.n_times_replay == 0 and self.disable_online_update:
        # At least one of them must be enabled
        pytest.skip()
    agent = acer.ACER(
        model,
        opt,
        replay_buffer=replay_buffer,
        t_max=t_max,
        gamma=gamma,
        beta=beta,
        n_times_replay=self.n_times_replay,
        act_deterministically=True,
        disable_online_update=self.disable_online_update,
        replay_start_size=100,
        use_trust_region=self.use_trust_region,
        recurrent=use_lstm,
    )

    max_episode_len = None if episodic else 2

    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir,
            processes=nproc,
            make_env=make_env,
            agent=agent,
            steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
        )
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    if require_success:
        agent.load(os.path.join(self.outdir, "successful"))

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    # run_evaluation_episodes returns a pair; keep only the episode returns.
    eval_returns, _ = run_evaluation_episodes(
        env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
        max_episode_len=max_episode_len,
    )
    successful_return = 1
    if require_success:
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        assert n_succeeded == n_test_runs
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="CartPole-v0")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--beta", type=float, default=1e-4)
    parser.add_argument("--batchsize", type=int, default=10)
    parser.add_argument("--steps", type=int, default=10**5)
    parser.add_argument("--eval-interval", type=int, default=10**4)
    parser.add_argument("--eval-n-runs", type=int, default=100)
    parser.add_argument("--reward-scale-factor", type=float, default=1e-2)
    parser.add_argument("--render", action="store_true", default=False)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument("--log-level", type=int, default=logging.INFO)
    parser.add_argument("--monitor", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    train_env = make_env(test=False)
    timestep_limit = train_env.spec.max_episode_steps
    obs_space = train_env.observation_space
    action_space = train_env.action_space

    obs_size = obs_space.low.size
    hidden_size = 200
    # Switch policy types according to the action space type
    if isinstance(action_space, gym.spaces.Box):
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, action_space.low.size),
            GaussianHeadWithFixedCovariance(0.3),
        )
    else:
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, action_space.n),
            SoftmaxCategoricalHead(),
        )

    opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    agent = pfrl.agents.REINFORCE(
        model,
        opt,
        gpu=args.gpu,
        beta=args.beta,
        batchsize=args.batchsize,
        max_grad_norm=1.0,
    )
    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=train_env,
            eval_env=eval_env,
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=timestep_limit,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument("--outdir", type=str, default="results")
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--steps", type=int, default=8 * 10**7)
    parser.add_argument("--update-steps", type=int, default=5)
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--gamma", type=float, default=0.99, help="discount factor")
    parser.add_argument("--rmsprop-epsilon", type=float, default=1e-5)
    parser.add_argument(
        "--use-gae",
        action="store_true",
        default=False,
        help="use generalized advantage estimation",
    )
    parser.add_argument("--tau", type=float, default=0.95, help="gae parameter")
    parser.add_argument(
        "--alpha", type=float, default=0.99, help="RMSprop optimizer alpha"
    )
    parser.add_argument("--eval-interval", type=int, default=10**6)
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument(
        "--max-grad-norm", type=float, default=40, help="Maximum norm of gradients"
    )
    parser.add_argument(
        "--gpu",
        "-g",
        type=int,
        default=-1,
        help="GPU ID (negative value indicates CPU)",
    )
    parser.add_argument("--num-envs", type=int, default=1)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help="Monitor env. Videos and additional information are saved as output files.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_env, idx, test) for idx in range(args.num_envs)]
        )

    sample_env = make_env(0, test=False)
    obs_channel_size = sample_env.observation_space.low.shape[0]
    n_actions = sample_env.action_space.n

    model = nn.Sequential(
        nn.Conv2d(obs_channel_size, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
        pfrl.nn.Branched(
            nn.Sequential(
                nn.Linear(256, n_actions),
                SoftmaxCategoricalHead(),
            ),
            nn.Linear(256, 1),
        ),
    )

    optimizer = pfrl.optimizers.RMSpropEpsInsideSqrt(
        model.parameters(),
        lr=args.lr,
        eps=args.rmsprop_epsilon,
        alpha=args.alpha,
    )

    def phi(x):
        # Feature extractor: scale uint8 pixels to [0, 1] float32
        # (same phi as the sibling A3C/ACER examples; assumed here since
        # the A2C constructor below is passed phi).
        return np.asarray(x, dtype=np.float32) / 255

    agent = a2c.A2C(
        model,
        optimizer,
        gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
        max_grad_norm=args.max_grad_norm,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
        )
        print(
            "n_runs: {} mean: {} median: {} stdev: {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )