def __init__(self, obs_num, n_actions):
    """Compose an A3C model from a conv head, a softmax policy and a V-function.

    NOTE(review): the policy and value nets take ``n_actions`` as their input
    width — presumably A3C_HEAD emits ``n_actions`` features; confirm against
    A3C_HEAD's definition.
    """
    feature_size = n_actions
    self.head = A3C_HEAD(obs_num, n_actions)
    self.pi = policy.FCSoftmaxPolicy(feature_size, n_actions)
    self.v = v_function.FCVFunction(feature_size)
    super().__init__(self.head, self.pi, self.v)
def __init__(self, trial, width, height, action_size, lstm_size=128):
    """Recurrent model: trial-configured conv head -> LSTM -> softmax pi and V.

    Args:
        trial: passed through to MyHead (presumably an Optuna-style trial
            that picks the head's hyperparameters — confirm against MyHead).
        width: input image width, forwarded to the head.
        height: input image height, forwarded to the head.
        action_size: number of discrete actions for the softmax policy.
        lstm_size: width of the LSTM layer feeding both pi and v.
    """
    # Removed dead local ``obs_size = width * height`` — it was never used.
    self.head = MyHead(trial, width=width, height=height)
    self.lstm = L.LSTM(self.head.n_output_channels, lstm_size)
    self.pi = policies.FCSoftmaxPolicy(lstm_size, action_size)
    self.v = v_function.FCVFunction(lstm_size)
    super().__init__(self.head, self.lstm, self.pi, self.v)
def __init__(self, n_actions):
    """Recurrent A3C model built on the NIPS DQN convolutional head."""
    self.head = links.NIPSDQNHead()
    n_feat = self.head.n_output_channels
    self.pi = policy.FCSoftmaxPolicy(n_feat, n_actions)
    self.v = v_function.FCVFunction(n_feat)
    # The LSTM keeps the feature width unchanged.
    self.lstm = L.LSTM(n_feat, n_feat)
    super().__init__(self.head, self.lstm, self.pi, self.v)
def __init__(self, n_input, n_actions, n_hidden):
    """Actor-critic model sharing a QFunction-based feature head."""
    self.head = QFunction(n_input, n_hidden)
    n_feat = self.head.n_output_channels
    self.pi = policy.FCSoftmaxPolicy(n_feat, n_actions)
    self.v = v_function.FCVFunction(n_feat)
    super().__init__(self.head, self.pi, self.v)
def __init__(self, obs_size, action_size, hidden_size=200, lstm_size=128):
    """Recurrent Gaussian actor-critic with fully separate pi and V streams.

    Each stream is Linear(obs -> hidden) followed by its own LSTM; the
    Gaussian policy and the value head read the LSTM outputs.
    """
    # Link creation order is kept identical to preserve parameter
    # initialization order.
    self.pi_head = L.Linear(obs_size, hidden_size)
    self.v_head = L.Linear(obs_size, hidden_size)
    self.pi_lstm = L.LSTM(hidden_size, lstm_size)
    self.v_lstm = L.LSTM(hidden_size, lstm_size)
    self.pi = policies.FCGaussianPolicy(lstm_size, action_size)
    self.v = v_function.FCVFunction(lstm_size)
    super().__init__(self.pi_head, self.v_head,
                     self.pi_lstm, self.v_lstm,
                     self.pi, self.v)
def __init__(self, obs_size, action_size):
    """Recurrent deterministic policy paired with a tanh MLP V-function."""
    n_layers = 2
    n_channels = 64
    self.pi = FCLSTMDeterministicPolicy(
        n_input_channels=obs_size,
        action_size=action_size,
        n_hidden_layers=n_layers,
        n_hidden_channels=n_channels,
        min_action=0,
        max_action=1,
    )
    self.v = v_function.FCVFunction(
        obs_size,
        n_hidden_layers=n_layers,
        n_hidden_channels=n_channels,
        nonlinearity=F.tanh,
    )
    super().__init__(self.pi, self.v)
def __init__(self, obs_size, action_size):
    """Gaussian policy with fixed covariance plus a tanh MLP V-function."""
    # Fixed covariance parameter passed positionally to the policy.
    fixed_cov_param = np.log(np.e - 1)
    self.pi = policies.FCGaussianPolicyWithFixedCovariance(
        obs_size,
        action_size,
        fixed_cov_param,
        n_hidden_layers=2,
        n_hidden_channels=64,
    )
    self.v = v_function.FCVFunction(
        obs_size,
        n_hidden_layers=2,
        n_hidden_channels=64,
        nonlinearity=F.tanh,
    )
    super().__init__(self.pi, self.v)
def __init__(self, imsize, action_size, L_stages, conditional):
    """Convolutional encoder with BN, a value head, and a 1-D conv decoder.

    Args:
        imsize: input image size (stored for use elsewhere in the class).
        action_size: output channel count of the final decoder convolution.
        L_stages: sets the final encoder channel width to ``L_stages + 12``.
        conditional: when True the encoder takes 6 input channels instead
            of 3 (presumably two stacked RGB images — confirm with callers).
    """
    self.imsize = imsize
    self.action_size = action_size
    self.f = F.relu  # activation function for the encoding part
    self.L_stages = L_stages
    self.conditional = conditional
    super().__init__()
    out_ch = L_stages + 12  # channel width of the last encoder stage
    with self.init_scope():
        in_channel = 6 if self.conditional else 3
        # Encoder: a stride-1 stem followed by five stride-2 convolutions.
        self.c1 = L.Convolution2D(in_channel, 16, stride=1, ksize=3, pad=1)
        self.c2 = L.Convolution2D(16, 32, stride=2, ksize=3, pad=1)
        self.c3 = L.Convolution2D(32, 48, stride=2, ksize=2, pad=1)
        self.c4 = L.Convolution2D(48, 48, stride=2, ksize=2, pad=1)
        self.c5 = L.Convolution2D(48, 64, stride=2, ksize=2, pad=1)
        self.c6 = L.Convolution2D(64, out_ch, stride=2, ksize=2, pad=1)
        # Batch norms whose sizes match the c2..c6 output widths.
        self.bn1 = L.BatchNormalization(32)
        self.bn2 = L.BatchNormalization(48)
        self.bn3 = L.BatchNormalization(48)
        self.bn4 = L.BatchNormalization(64)
        self.bn5 = L.BatchNormalization(out_ch)
        # Value head over 3*3*out_ch features (presumably the flattened
        # encoder output — confirm against the forward pass).
        self.v = v_function.FCVFunction(3 * 3 * out_ch)
        # Decoder: 1-D convolutions ending in ``action_size`` channels.
        self.dc1 = L.Convolution1D(1, 16, stride=1, ksize=3)
        self.dc2 = L.Convolution1D(16, 32, stride=1, ksize=3)
        self.dc3 = L.Convolution1D(32, 48, stride=1, ksize=3)
        self.dc4 = L.Convolution1D(48, 48, stride=1, ksize=3)
        self.dc5 = L.Convolution1D(48, 64, stride=1, ksize=3)
        self.dc6 = L.Convolution1D(64, action_size, stride=1, ksize=3)
        # Batch norms whose sizes match the dc1..dc5 output widths.
        self.dbn1 = L.BatchNormalization(16)
        self.dbn2 = L.BatchNormalization(32)
        self.dbn3 = L.BatchNormalization(48)
        self.dbn4 = L.BatchNormalization(48)
        self.dbn5 = L.BatchNormalization(64)
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):
    """Train a PCL agent on the toy ABC env and verify it solves it.

    Builds a shared-LSTM or separate feed-forward model (softmax policy
    for discrete action spaces, Gaussian otherwise), trains either
    asynchronously or single-process depending on ``self.train_async``,
    then runs 5 deterministic evaluation episodes.  When
    ``require_success`` is True, each episode must score 1.
    """
    nproc = 8  # worker processes for asynchronous training

    def make_env(process_idx, test):
        size = 2
        # NOTE(review): partial observability follows self.use_lstm rather
        # than the ``use_lstm`` argument — confirm this is intentional.
        return ABC(size=size,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        # Identity feature extractor: observations are fed in as-is.
        return x

    n_hidden_channels = 20
    n_hidden_layers = 2
    nonlinearity = F.relu
    if use_lstm:
        if discrete:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
            )
        else:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    var_wscale=1e-2,
                    var_bias=1,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    min_var=1e-1,
                ),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
            )
    else:
        if discrete:
            model = a3c.A3CSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
            )
        else:
            model = a3c.A3CSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    var_wscale=1e-2,
                    var_bias=1,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    min_var=1e-1,
                ),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    last_wscale=1e-2,
                ),
            )
    # NOTE(review): eps differs with backprop_future_values — the rationale
    # for the split is not visible from this file.
    eps = 1e-8 if self.backprop_future_values else 1e-1
    opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
    opt.setup(model)
    gamma = 0.5
    tau = 1e-2
    replay_buffer = chainerrl.replay_buffer.EpisodicReplayBuffer(10**5)
    agent = pcl.PCL(model, opt, replay_buffer=replay_buffer,
                    t_max=t_max, gamma=gamma, tau=tau, phi=phi,
                    n_times_replay=1,
                    batchsize=self.batchsize,
                    train_async=self.train_async,
                    backprop_future_values=self.backprop_future_values,
                    act_deterministically=True)
    if self.train_async:
        # Asynchronous training must complete without emitting warnings.
        with warnings.catch_warnings(record=True) as warns:
            chainerrl.experiments.train_agent_async(
                outdir=self.outdir, processes=nproc, make_env=make_env,
                agent=agent, steps=steps,
                max_episode_len=2,
                eval_interval=200,
                eval_n_runs=5,
                successful_score=1)
            assert len(warns) == 0, warns[0]
        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other
        # processes after success. Thus here the successful model is loaded
        # explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
    else:
        agent.process_idx = 0
        chainerrl.experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(0, False),
            eval_env=make_env(0, True),
            outdir=self.outdir,
            steps=steps,
            max_episode_len=2,
            eval_interval=200,
            eval_n_runs=5,
            successful_score=1)
    agent.stop_episode()

    # Test: 5 deterministic episodes, each expected to total a reward of 1.
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):
    """Train an ACER agent on the toy ABC env and verify it solves it.

    Builds a shared-LSTM or separate model (softmax policy + Q-function
    for discrete action spaces; SDN variant with Gaussian policy, V and
    advantage nets otherwise), trains asynchronously, then runs 5
    deterministic evaluation episodes.  When ``require_success`` is True,
    each episode must score 1.
    """
    nproc = 8  # worker processes for asynchronous training

    def make_env(process_idx, test):
        size = 2
        # NOTE(review): partial observability follows self.use_lstm rather
        # than the ``use_lstm`` argument — confirm this is intentional.
        return ABC(size=size,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        # Identity feature extractor: observations are fed in as-is.
        return x

    n_hidden_channels = 20
    n_hidden_layers = 1
    nonlinearity = F.leaky_relu
    replay_buffer = EpisodicReplayBuffer(10**4)
    if use_lstm:
        if discrete:
            model = acer.ACERSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    min_prob=1e-1),
                q=q_function.FCStateQFunctionWithDiscreteAction(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
        else:
            model = acer.ACERSDNSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    nonlinearity=nonlinearity,
                    min_var=1e-1),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
                adv=q_function.FCSAQFunction(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
    else:
        if discrete:
            model = acer.ACERSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity,
                    min_prob=1e-1),
                q=q_function.FCStateQFunctionWithDiscreteAction(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
        else:
            model = acer.ACERSDNSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high,
                    nonlinearity=nonlinearity,
                    min_var=1e-1),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
                adv=q_function.FCSAQFunction(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=n_hidden_layers,
                    nonlinearity=nonlinearity),
            )
    eps = 1e-8
    opt = rmsprop_async.RMSpropAsync(lr=1e-3, eps=eps, alpha=0.99)
    opt.setup(model)
    gamma = 0.5
    beta = 1e-5
    if self.n_times_replay == 0 and self.disable_online_update:
        # At least one of them must be enabled
        return
    agent = acer.ACER(model, opt, replay_buffer=replay_buffer,
                      t_max=t_max, gamma=gamma, beta=beta,
                      phi=phi,
                      n_times_replay=self.n_times_replay,
                      act_deterministically=True,
                      disable_online_update=self.disable_online_update,
                      replay_start_size=100,
                      use_trust_region=self.use_trust_region)
    max_episode_len = None if episodic else 2

    # Asynchronous training must complete without emitting warnings.
    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir, processes=nproc, make_env=make_env,
            agent=agent, steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1)
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    if require_success:
        agent.load(os.path.join(self.outdir, 'successful'))
    agent.stop_episode()

    # Test: 5 deterministic episodes, each expected to total a reward of 1.
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def create_v_function_for_env(env):
    """Return an FCVFunction sized for the env's flat Box observation."""
    space = env.observation_space
    assert isinstance(space, gym.spaces.Box)
    return v_function.FCVFunction(space.low.size)
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):
    """Train an A3C agent on the toy ABC env and verify it solves it.

    Builds a shared-LSTM or separate feed-forward model (softmax policy
    for discrete action spaces, Gaussian otherwise), trains asynchronously
    with Adam + gradient clipping, then runs 5 deterministic evaluation
    episodes.  When ``require_success`` is True, each episode must score 1.
    """
    nproc = 8  # worker processes for asynchronous training

    def make_env(process_idx, test):
        size = 2
        # NOTE(review): partial observability follows self.use_lstm rather
        # than the ``use_lstm`` argument — confirm this is intentional.
        return ABC(size=size,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        # Identity feature extractor: observations are fed in as-is.
        return x

    n_hidden_channels = 20
    if use_lstm:
        if discrete:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
            )
        else:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    mean_wscale=1e-1,
                ),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
            )
    else:
        if discrete:
            model = a3c.A3CSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
            )
        else:
            model = a3c.A3CSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    mean_wscale=1e-1,
                ),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    nonlinearity=F.tanh,
                    last_wscale=1e-1,
                ),
            )
    opt = chainer.optimizers.Adam()
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(1))
    gamma = 0.8
    beta = 1e-2
    agent = a3c.A3C(model, opt, t_max=t_max, gamma=gamma, beta=beta,
                    phi=phi, act_deterministically=True)
    max_episode_len = None if episodic else 2

    # Asynchronous training must complete without emitting warnings.
    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir, processes=nproc, make_env=make_env,
            agent=agent, steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1)
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    if require_success:
        agent.load(os.path.join(self.outdir, 'successful'))
    agent.stop_episode()

    # Test: 5 deterministic episodes, each expected to total a reward of 1.
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def __init__(self, n_actions):
    """A3C model on top of the ICLR ACER conv head with guided-ReLU."""
    self.head = ICLRACERHead(activation=guided_relu)
    n_feat = self.head.n_output_channels
    self.pi = policy.FCSoftmaxPolicy(n_feat, n_actions)
    self.v = v_function.FCVFunction(n_feat)
    super().__init__(self.head, self.pi, self.v)
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=1000000):
    """Train an A3C agent on the toy ABC env and require it to solve it.

    Builds a shared-LSTM or separate feed-forward model (softmax policy
    for discrete action spaces, bounded-mean Gaussian otherwise), trains
    asynchronously with RMSpropAsync, then runs 5 deterministic test
    episodes, each of which must score 1.
    """
    nproc = 8  # worker processes for asynchronous training

    def make_env(process_idx, test):
        size = 2
        # NOTE(review): partial observability follows self.use_lstm rather
        # than the ``use_lstm`` argument — confirm this is intentional.
        return ABC(size=size,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        # Identity feature extractor: observations are fed in as-is.
        return x

    n_hidden_channels = 20
    if use_lstm:
        if discrete:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCSoftmaxPolicy(
                    n_hidden_channels, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
            )
        else:
            model = a3c.A3CSharedModel(
                shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                pi=policies.FCGaussianPolicy(
                    n_hidden_channels, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high),
                v=v_function.FCVFunction(
                    n_hidden_channels,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
            )
    else:
        if discrete:
            model = a3c.A3CSeparateModel(
                pi=policies.FCSoftmaxPolicy(
                    obs_space.low.size, action_space.n,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
            )
        else:
            model = a3c.A3CSeparateModel(
                pi=policies.FCGaussianPolicy(
                    obs_space.low.size, action_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2,
                    bound_mean=True,
                    min_action=action_space.low,
                    max_action=action_space.high),
                v=v_function.FCVFunction(
                    obs_space.low.size,
                    n_hidden_channels=n_hidden_channels,
                    n_hidden_layers=2),
            )
    # NOTE(review): eps differs by action-space type — the rationale for
    # the split is not visible from this file.
    eps = 1e-1 if discrete else 1e-2
    opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
    opt.setup(model)
    gamma = 0.9
    beta = 1e-2
    agent = a3c.A3C(model, opt, t_max=t_max, gamma=gamma, beta=beta, phi=phi,
                    act_deterministically=True)
    max_episode_len = None if episodic else 2
    train_agent_async(
        outdir=self.outdir, processes=nproc, make_env=make_env,
        agent=agent, steps=steps,
        max_episode_len=max_episode_len,
        eval_interval=500,
        eval_n_runs=5,
        successful_score=1)
    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    agent.load(os.path.join(self.outdir, 'successful'))
    agent.stop_episode()

    # Test: 5 deterministic episodes, each expected to total a reward of 1.
    env = make_env(0, True)
    n_test_runs = 5
    for _ in range(n_test_runs):
        total_r = 0
        obs = env.reset()
        done = False
        reward = 0.0
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            total_r += reward
        self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def __init__(self, trial, n_actions, width=None, height=None):
    """Actor-critic model on a trial-configured MyHead feature extractor."""
    self.head = MyHead(trial, width=width, height=height)
    n_feat = self.head.n_output_channels
    self.pi = policy.FCSoftmaxPolicy(n_feat, n_actions)
    self.v = v_function.FCVFunction(n_feat)
    super().__init__(self.head, self.pi, self.v)