def __init__(
    self,
    n_dim_obs,
    n_dim_action,
    n_hidden_channels,
    n_hidden_layers,
    nonlinearity=F.relu,
    last_wscale=1.0,
):
    assert n_hidden_layers >= 1
    self.n_input_channels = n_dim_obs + n_dim_action
    self.n_hidden_layers = n_hidden_layers
    self.n_hidden_channels = n_hidden_channels
    self.nonlinearity = nonlinearity

    super().__init__()

    # No need to pass nonlinearity to obs_mlp because it has no
    # hidden layers
    self.obs_mlp = MLP(in_size=n_dim_obs, out_size=n_hidden_channels, hidden_sizes=[])
    self.mlp = MLP(
        in_size=n_hidden_channels + n_dim_action,
        out_size=1,
        hidden_sizes=[self.n_hidden_channels] * (self.n_hidden_layers - 1),
        nonlinearity=nonlinearity,
        last_wscale=last_wscale,
    )

    self.output = self.mlp.output
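# A minimal usage sketch, assuming this __init__ belongs to a "late action"
# state-action Q-function such as pfrl.q_functions.FCLateActionSAQFunction
# (assumed import path), whose forward pass runs the observation through
# obs_mlp and concatenates the action afterwards. All sizes are illustrative.
import torch
from pfrl.q_functions import FCLateActionSAQFunction

late_action_q_func = FCLateActionSAQFunction(
    n_dim_obs=11, n_dim_action=3, n_hidden_channels=64, n_hidden_layers=2
)
obs_batch = torch.zeros(32, 11)      # (batch, n_dim_obs)
action_batch = torch.zeros(32, 3)    # (batch, n_dim_action)
q_values = late_action_q_func(obs_batch, action_batch)  # expected shape: (32, 1)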
def __init__(
    self,
    n_actions,
    n_input_channels=4,
    activation=F.relu,
    bias=0.1,
    reward_boundaries=None,
    reward_channel_scale=1.0,
):
    self.n_actions = n_actions
    self.n_input_channels = n_input_channels
    self.activation = activation
    # Reward boundaries scaled to the reward channel scale, nudged down by a
    # small epsilon to avoid floating-point ties at exact boundary values.
    self.boundaries = (
        torch.from_numpy(np.array(reward_boundaries)) * reward_channel_scale - 1e-8
    )
    super().__init__()
    self.conv_layers = nn.ModuleList(
        [
            nn.Conv2d(n_input_channels, 32, 8, stride=4),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.Conv2d(64, 64, 3, stride=1),
        ]
    )

    # Modified from 3136 -> 1024
    self.a_streams = nn.ModuleList(
        [MLP(1024, n_actions, [512]) for _ in range(len(self.boundaries) + 1)]
    )
    self.v_streams = nn.ModuleList(
        [MLP(1024, 1, [512]) for _ in range(len(self.boundaries) + 1)]
    )

    self.conv_layers.apply(init_chainer_default)  # MLP already applies
    self.conv_layers.apply(constant_bias_initializer(bias=bias))
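# A small illustrative sketch (values are hypothetical) of why the module
# above builds len(boundaries) + 1 advantage/value stream pairs: the reward
# boundaries split the reward range into that many channels, and each channel
# gets its own dueling head.
import numpy as np

example_boundaries = np.array([0.1, 1.0])         # two boundaries ...
n_reward_channels = len(example_boundaries) + 1   # ... give three channels
assert n_reward_channels == 3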
def __init__(
    self,
    n_dim_obs,
    n_dim_action,
    n_hidden_channels,
    n_hidden_layers,
    nonlinearity=F.relu,
    last_wscale=1.0,
):
    # The constructor raises immediately; the code below is unreachable and
    # kept only for reference.
    raise NotImplementedError()
    self.n_input_channels = n_dim_obs + n_dim_action
    self.n_hidden_layers = n_hidden_layers
    self.n_hidden_channels = n_hidden_channels
    self.nonlinearity = nonlinearity
    super().__init__()
    self.fc = MLP(
        self.n_input_channels,
        n_hidden_channels,
        [self.n_hidden_channels] * self.n_hidden_layers,
        nonlinearity=nonlinearity,
    )
    self.lstm = nn.LSTM(
        num_layers=1,
        input_size=n_hidden_channels,
        hidden_size=n_hidden_channels,
    )
    self.out = nn.Linear(n_hidden_channels, 1)
    for n, p in self.lstm.named_parameters():
        if "weight" in n:
            init_lecun_normal(p)
        else:
            nn.init.zeros_(p)
    init_lecun_normal(self.out.weight, scale=last_wscale)
    nn.init.zeros_(self.out.bias)
def __init__(
    self,
    ndim_obs,
    n_actions,
    n_atoms,
    v_min,
    v_max,
    n_hidden_channels,
    n_hidden_layers,
    nonlinearity=F.relu,
    last_wscale=1.0,
):
    assert n_atoms >= 2
    assert v_min < v_max
    z_values = np.linspace(v_min, v_max, num=n_atoms, dtype=np.float32)
    model = nn.Sequential(
        MLP(
            in_size=ndim_obs,
            out_size=n_actions * n_atoms,
            hidden_sizes=[n_hidden_channels] * n_hidden_layers,
            nonlinearity=nonlinearity,
            last_wscale=last_wscale,
        ),
        Lambda(lambda x: torch.reshape(x, (-1, n_actions, n_atoms))),
        nn.Softmax(dim=2),
    )
    super().__init__(model=model, z_values=z_values)
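# A minimal sketch of how a categorical (C51-style) output of this shape
# relates to scalar Q-values, assuming the (batch, n_actions, n_atoms) softmax
# output is a probability distribution over the atoms in z_values; the
# numbers are illustrative.
import numpy as np
import torch

ex_n_actions, ex_n_atoms, ex_v_min, ex_v_max = 4, 51, -10.0, 10.0
ex_z_values = np.linspace(ex_v_min, ex_v_max, num=ex_n_atoms, dtype=np.float32)

# Dummy output with the same shape the model above produces.
ex_probs = torch.softmax(torch.randn(1, ex_n_actions, ex_n_atoms), dim=2)
# Expected return per action: sum_i p_i * z_i.
ex_q_values = (ex_probs * torch.from_numpy(ex_z_values)).sum(dim=2)  # (1, n_actions)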
def __init__(self, n_actions, n_input_channels=4, activation=F.relu, bias=0.1):
    self.n_actions = n_actions
    self.n_input_channels = n_input_channels
    self.activation = activation
    super().__init__()

    self.conv_layers = nn.ModuleList(
        [
            nn.Conv2d(n_input_channels, 32, 8, stride=4),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.Conv2d(64, 64, 3, stride=1),
        ]
    )

    self.a_stream = MLP(3136, n_actions, [512])
    self.v_stream = MLP(3136, 1, [512])

    self.conv_layers.apply(init_chainer_default)  # MLP already applies
    self.conv_layers.apply(constant_bias_initializer(bias=bias))
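# A short sketch of the dueling aggregation these two streams are typically
# combined with (Wang et al., 2016): Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a').
# The exact combination used by the surrounding class may differ; the tensors
# below are placeholders.
import torch

ex_advantages = torch.randn(2, 6)   # a_stream output: (batch, n_actions)
ex_values = torch.randn(2, 1)       # v_stream output: (batch, 1)
ex_q = ex_values + ex_advantages - ex_advantages.mean(dim=1, keepdim=True)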
def __init__(
    self,
    ndim_obs,
    n_actions,
    n_hidden_channels,
    n_hidden_layers,
    nonlinearity=F.relu,
    last_wscale=1.0,
):
    super().__init__(
        model=MLP(
            in_size=ndim_obs,
            out_size=n_actions,
            hidden_sizes=[n_hidden_channels] * n_hidden_layers,
            nonlinearity=nonlinearity,
            last_wscale=last_wscale,
        )
    )
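# A minimal usage sketch, assuming this __init__ belongs to a discrete-action,
# state-input Q-function such as pfrl.q_functions.FCStateQFunctionWithDiscreteAction
# (assumed import path); parameter values are illustrative.
import torch
from pfrl.q_functions import FCStateQFunctionWithDiscreteAction

discrete_q_func = FCStateQFunctionWithDiscreteAction(
    ndim_obs=4, n_actions=2, n_hidden_channels=64, n_hidden_layers=2
)
example_obs = torch.zeros(1, 4)                # (batch, ndim_obs)
action_value = discrete_q_func(example_obs)    # ActionValue over the 2 actions
greedy_action = action_value.greedy_actions    # argmax action per batch element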
def _objective_core(
    # optuna parameters
    trial,
    # training parameters
    env_id,
    outdir,
    seed,
    monitor,
    gpu,
    steps,
    train_max_episode_len,
    eval_n_episodes,
    eval_interval,
    batch_size,
    # hyperparameters
    hyperparams,
):
    # Set a random seed used in PFRL
    utils.set_random_seed(seed)

    # Set different random seeds for train and test envs.
    train_seed = seed
    test_seed = 2**31 - 1 - seed

    def make_env(test=False):
        env = gym.make(env_id)

        if not isinstance(env.observation_space, gym.spaces.Box):
            raise ValueError(
                "Supported only Box observation environments, but given: {}".format(
                    env.observation_space
                )
            )
        if len(env.observation_space.shape) != 1:
            raise ValueError(
                "Supported only observation spaces with ndim==1, but given: {}".format(
                    env.observation_space.shape
                )
            )
        if not isinstance(env.action_space, gym.spaces.Discrete):
            raise ValueError(
                "Supported only discrete action environments, but given: {}".format(
                    env.action_space
                )
            )

        env_seed = test_seed if test else train_seed
        env.seed(env_seed)

        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if monitor:
            env = pfrl.wrappers.Monitor(env, outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = pfrl.wrappers.ScaleReward(env, hyperparams["reward_scale_factor"])
        return env

    env = make_env(test=False)
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space
    n_actions = action_space.n

    # create model & q_function
    model = MLP(
        in_size=obs_size, out_size=n_actions, hidden_sizes=hyperparams["hidden_sizes"]
    )
    q_func = q_functions.SingleModelStateQFunctionWithDiscreteAction(model=model)

    # Use epsilon-greedy for exploration
    start_epsilon = 1
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=start_epsilon,
        end_epsilon=hyperparams["end_epsilon"],
        decay_steps=hyperparams["decay_steps"],
        random_action_func=action_space.sample,
    )

    opt = optim.Adam(
        q_func.parameters(), lr=hyperparams["lr"], eps=hyperparams["adam_eps"]
    )

    rbuf_capacity = steps
    rbuf = replay_buffers.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func,
        opt,
        rbuf,
        gpu=gpu,
        gamma=hyperparams["gamma"],
        explorer=explorer,
        replay_start_size=hyperparams["replay_start_size"],
        target_update_interval=hyperparams["target_update_interval"],
        update_interval=hyperparams["update_interval"],
        minibatch_size=batch_size,
    )

    eval_env = make_env(test=True)

    evaluation_hooks = [OptunaPrunerHook(trial=trial)]
    _, eval_stats_history = experiments.train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=steps,
        eval_n_steps=None,
        eval_n_episodes=eval_n_episodes,
        eval_interval=eval_interval,
        outdir=outdir,
        eval_env=eval_env,
        train_max_episode_len=train_max_episode_len,
        evaluation_hooks=evaluation_hooks,
    )

    score = _get_score_from_eval_stats_history(eval_stats_history)

    return score
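# A minimal sketch (hypothetical environment, budgets, and search ranges) of
# how _objective_core could be plugged into an Optuna study by sampling the
# hyperparameters it expects and maximizing the returned evaluation score.
import optuna

def example_objective(trial):
    hyperparams = {
        "reward_scale_factor": trial.suggest_float("reward_scale_factor", 1e-5, 10.0, log=True),
        "hidden_sizes": [trial.suggest_int("n_hidden_units", 32, 256)] * 2,
        "end_epsilon": trial.suggest_float("end_epsilon", 0.0, 0.3),
        "decay_steps": trial.suggest_int("decay_steps", 1000, 100000),
        "lr": trial.suggest_float("lr", 1e-5, 1e-2, log=True),
        "adam_eps": trial.suggest_float("adam_eps", 1e-8, 1e-3, log=True),
        "gamma": trial.suggest_float("gamma", 0.9, 0.999),
        "replay_start_size": trial.suggest_int("replay_start_size", 1000, 10000),
        "target_update_interval": trial.suggest_int("target_update_interval", 100, 10000),
        "update_interval": trial.suggest_int("update_interval", 1, 8),
    }
    return _objective_core(
        trial=trial,
        env_id="CartPole-v0",
        outdir="results",
        seed=0,
        monitor=False,
        gpu=-1,
        steps=40000,
        train_max_episode_len=200,
        eval_n_episodes=10,
        eval_interval=5000,
        batch_size=64,
        hyperparams=hyperparams,
    )

study = optuna.create_study(direction="maximize")
study.optimize(example_objective, n_trials=10)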