def main(args):
    """Wire up and run TD3 training on a single MuJoCo environment.

    Builds the train/eval environments, the TD3 network, replay buffer,
    metrics, exploration noise and controllers, then enters the
    interaction loop inside a TF session.
    """
    # training and evaluation environments, both seeded identically
    env = MuJoCoWrapper(gym.make(args.env), args.reward_scale, args.render)
    env.seed(args.seed)
    eval_env = MuJoCoWrapper(gym.make(args.env))
    eval_env.seed(args.seed)

    num_actions = env.action_space.shape[0]

    # bundle all network hyperparameters into a single params object
    params = TD3NetworkParams(fcs=args.layers,
                              concat_index=args.concat_index,
                              state_shape=env.observation_space.shape,
                              num_actions=num_actions,
                              gamma=args.gamma,
                              tau=args.tau,
                              actor_lr=args.actor_lr,
                              critic_lr=args.critic_lr,
                              target_noise_sigma=args.target_noise_sigma,
                              target_noise_clip=args.target_noise_clip)
    network = TD3Network(params)

    # experience replay
    replay_buffer = Buffer(args.buffer_size)

    # metrics and checkpointing
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # zero-mean Gaussian exploration noise, sigma 0.1 per action dim
    noise_mean = np.zeros(num_actions)
    noise_sigma = np.ones(num_actions) * 0.1
    noise = NormalActionNoise(noise_mean, noise_sigma)

    # training controller + view
    controller = TD3Controller(network, replay_buffer, metrics, noise,
                               num_actions, args.batch_size,
                               args.final_steps, args.log_interval,
                               args.save_interval, args.eval_interval)
    view = View(controller)

    # evaluation controller + view
    eval_controller = EvalController(network, metrics, args.eval_episode)
    eval_view = View(eval_controller)

    # record hyperparameters for this run
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # save model graph for debugging
        metrics.set_model_graph(sess.graph)
        if args.load is not None:
            saver.restore(sess, args.load)
        interact(env, view, eval_env, eval_view)
def main(args):
    """Wire up and run PPO training over a batch of environments.

    Creates batched train/eval envs, the PPO network, a rollout buffer,
    metrics and controllers, then enters the batched interaction loop
    inside a TF session.
    """
    # batched training and evaluation environments, seeded identically
    env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale), args.render)
    env.seed(args.seed)
    eval_env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale))
    eval_env.seed(args.seed)

    num_actions = env.action_space.shape[0]

    # bundle all network hyperparameters into a single params object
    params = PPONetworkParams(fcs=args.layers,
                              num_actions=num_actions,
                              state_shape=env.observation_space.shape,
                              num_envs=args.num_envs,
                              batch_size=args.batch_size,
                              epsilon=args.epsilon,
                              learning_rate=args.lr,
                              grad_clip=args.grad_clip,
                              value_factor=args.value_factor,
                              entropy_factor=args.entropy_factor)
    network = PPONetwork(params)

    # on-policy rollout storage
    rollout = Rollout()

    # metrics and checkpointing
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # training controller + view
    controller = PPOController(network, rollout, metrics, args.num_envs,
                               args.time_horizon, args.epoch,
                               args.batch_size, args.gamma, args.lam,
                               args.final_steps, args.log_interval,
                               args.save_interval, args.eval_interval)
    view = View(controller)

    # evaluation controller + view
    eval_controller = EvalController(network, metrics, args.eval_episodes)
    eval_view = View(eval_controller)

    # record hyperparameters for this run
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # save model graph for debugging
        metrics.set_model_graph(sess.graph)
        if args.load is not None:
            saver.restore(sess, args.load)
        interact(env, view, eval_env, eval_view, batch=True)
def main(args):
    """Wire up and run SAC training on a single MuJoCo environment.

    Builds the train/eval environments, the SAC network, replay buffer,
    metrics and controllers, then enters the interaction loop inside a
    TF session.
    """
    # training and evaluation environments
    # NOTE(review): unlike the TD3/PPO launchers, envs are not seeded
    # here — confirm whether this script's arg parser exposes a seed.
    env = MuJoCoWrapper(gym.make(args.env), args.reward_scale, args.render)
    eval_env = MuJoCoWrapper(gym.make(args.env))

    num_actions = env.action_space.shape[0]

    # SAC network with separate policy/Q/value learning rates
    network = SACNetwork(args.layers, args.concat_index,
                         env.observation_space.shape, num_actions,
                         args.gamma, args.tau, args.pi_lr, args.q_lr,
                         args.v_lr, args.reg)

    # experience replay
    replay_buffer = Buffer(args.buffer_size)

    # metrics and checkpointing
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # SAC explores via its stochastic policy, so no additive noise
    noise = EmptyNoise()

    # training controller + view
    controller = SACController(network, replay_buffer, metrics, noise,
                               num_actions, args.batch_size,
                               args.final_steps, args.log_interval,
                               args.save_interval, args.eval_interval)
    view = View(controller)

    # evaluation controller + view
    eval_controller = EvalController(network, metrics, args.eval_episode)
    eval_view = View(eval_controller)

    # record hyperparameters for this run
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # save model graph for debugging
        metrics.set_model_graph(sess.graph)
        if args.load is not None:
            saver.restore(sess, args.load)
        interact(env, view, eval_env, eval_view)
def main(args):
    """Wire up and run PPO training over a batch of environments.

    Positional-constructor variant: builds batched train/eval envs, the
    PPO network, rollout storage, metrics and controllers, then runs the
    batched interaction loop inside a TF session.
    """
    # batched training and evaluation environments
    env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale), args.render)
    eval_env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale))

    num_actions = env.action_space.shape[0]

    # PPO network (positional hyperparameters)
    network = PPONetwork(args.layers, env.observation_space.shape,
                         args.num_envs, num_actions, args.batch_size,
                         args.epsilon, args.lr, args.grad_clip,
                         args.value_factor, args.entropy_factor)

    # on-policy rollout storage
    rollout = Rollout()

    # metrics and checkpointing
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # training controller + view
    controller = PPOController(network, rollout, metrics, args.num_envs,
                               args.time_horizon, args.epoch,
                               args.batch_size, args.gamma, args.lam,
                               args.final_steps, args.log_interval,
                               args.save_interval, args.eval_interval)
    view = View(controller)

    # evaluation controller + view
    eval_controller = EvalController(network, metrics, args.eval_episodes)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # save model graph for debugging
        metrics.set_model_graph(sess.graph)
        if args.load is not None:
            saver.restore(sess, args.load)
        batch_interact(env, view, eval_env, eval_view)
def setup_method(self):
    """Create fresh dummy collaborators and a controller before each test.

    ``metrics.has`` is stubbed to always report the metric as present so
    the controller's lookups never short-circuit.
    """
    self.network = DummyNetwork()
    metrics = DummyMetrics()
    metrics.has = MagicMock(return_value=True)
    self.metrics = metrics
    # evaluation limit of 10 episodes
    self.controller = EvalController(self.network, metrics, 10)
class TestEvalController:
    """Unit tests for EvalController using mocked network and metrics."""

    def setup_method(self):
        # Fresh dummies per test; metrics.has always reports the metric
        # as present. Evaluation limit is 10 episodes.
        self.network = DummyNetwork()
        self.metrics = DummyMetrics()
        self.metrics.has = MagicMock(return_value=True)
        self.controller = EvalController(self.network, self.metrics, 10)

    @pytest.mark.parametrize("batch", [True, False])
    def test_step(self, batch):
        # step() should return the network's inferred action and, in
        # batch mode, query metrics once per environment in the batch.
        output = make_output()
        self.network.infer = MagicMock(return_value=output)
        self.metrics.get = MagicMock(return_value=0)
        if batch:
            inpt = list(make_input(batch_size=4, batch=True))
            # inpt[2] is the done flag(s): no episode ends here
            inpt[2] = np.zeros((4, ))
        else:
            inpt = list(make_input())
            inpt[2] = 0.0
        step_output = self.controller.step(*inpt)
        # identity check: the controller must pass the action through
        assert step_output is output.action
        assert self.network.infer.call_count == 1
        if batch:
            # one metrics.get per env in the batch of 4
            assert self.metrics.get.call_count == 4
        else:
            assert self.metrics.get.call_count == 0

    @pytest.mark.parametrize("batch", [True, False])
    def test_step_with_done(self, batch):
        # When one batched env finishes, the controller should record
        # eval_episode first, then eval_reward. Non-batch completion is
        # handled by stop_episode instead, so metrics.add is untouched.
        output = make_output()
        self.network.infer = MagicMock(return_value=output)
        self.metrics.add = MagicMock()
        self.metrics.get = MagicMock(return_value=1)
        reward = np.random.random()
        if batch:
            inpt = list(make_input(batch_size=4, batch=True))
            # mark a single random env as done and give it a reward
            index = np.random.randint(4)
            inpt[2] = np.zeros((4, ))
            inpt[2][index] = 1.0
            inpt[3][index]['reward'] = reward
        else:
            inpt = list(make_input())
            inpt[2] = 1.0
            inpt[3]['reward'] = reward
        self.controller.step(*inpt)
        if batch:
            # order matters: eval_episode logged before eval_reward
            assert self.metrics.add.call_count == 2
            assert list(self.metrics.add.mock_calls[1])[1] == ('eval_reward',
                                                               reward)
            assert list(self.metrics.add.mock_calls[0])[1] == ('eval_episode',
                                                               1)
        else:
            self.metrics.add.assert_not_called()

    def test_step_with_eval_episode_over_limit(self):
        # Once the episode count reaches the limit (10), a finished env
        # must NOT be recorded: metrics.add would raise if called.
        output = make_output()
        self.network.infer = MagicMock(return_value=output)
        self.metrics.add = MagicMock(side_effect=Exception)
        self.metrics.get = MagicMock(return_value=10)
        inpt = list(make_input(batch_size=4, batch=True))
        index = np.random.randint(4)
        reward = np.random.random()
        inpt[2] = np.zeros((4, ))
        inpt[2][index] = 1.0
        inpt[3][index]['reward'] = reward
        # should complete without metrics.add being invoked
        self.controller.step(*inpt)

    def test_stop_episode(self):
        # Non-batch episode end: reward logged first, then episode count.
        self.metrics.add = MagicMock()
        obs, reward, _, info = make_input()
        self.controller.stop_episode(obs, reward, info)
        assert self.metrics.add.call_count == 2
        assert list(self.metrics.add.mock_calls[0])[1] == ('eval_reward',
                                                           info['reward'])
        assert list(self.metrics.add.mock_calls[1])[1] == ('eval_episode', 1)

    def test_should_update(self):
        # evaluation never trains the network
        assert not self.controller.should_update()

    def test_update(self):
        # update() is unsupported during evaluation
        with pytest.raises(Exception):
            self.controller.update()

    def test_should_log(self):
        assert not self.controller.should_log()

    def test_log(self):
        # log() is unsupported during evaluation
        with pytest.raises(Exception):
            self.controller.log()

    def test_is_finished(self):
        # Below the 10-episode limit: not finished, nothing reset/logged.
        self.metrics.get = MagicMock(return_value=5)
        self.metrics.reset = MagicMock()
        self.metrics.log_metric = MagicMock()
        assert not self.controller.is_finished()
        self.metrics.reset.assert_not_called()
        self.metrics.log_metric.assert_not_called()
        # At the limit: finished; counters reset (episode first, then
        # reward) and the final reward metric is logged.
        self.metrics.get = MagicMock(return_value=10)
        assert self.controller.is_finished()
        assert list(self.metrics.reset.mock_calls[0])[1] == ('eval_episode', )
        assert list(self.metrics.reset.mock_calls[1])[1] == ('eval_reward', )
        self.metrics.log_metric.assert_called_once_with('eval_reward', 10)

    def test_should_save(self):
        # evaluation never checkpoints
        assert not self.controller.should_save()

    def test_save(self):
        # save() must be a no-op for the eval controller
        self.metrics.save_model = MagicMock()
        self.controller.save()
        self.metrics.save_model.assert_not_called()