def test_is_pickleable(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'continuous_mlp_policy_with_model.MLPModel'),
                    new=SimpleMLPModel):
        policy = ContinuousMLPPolicyWithModel(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    with tf.compat.v1.variable_scope('ContinuousMLPPolicy/MLPModel',
                                     reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # assign it to all ones
    return_var.load(tf.ones_like(return_var).eval())

    output1 = self.sess.run(
        policy.model.outputs,
        feed_dict={policy.model.input: [obs.flatten()]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        output2 = sess.run(
            policy_pickled.model.outputs,
            feed_dict={policy_pickled.model.input: [obs.flatten()]})
        assert np.array_equal(output1, output2)
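# The test above patches MLPModel with a SimpleMLPModel test double. The sketch
# below is an illustrative guess at such a fixture, not the actual
# tests/fixtures implementation in garage: it exposes a single trainable
# 'return_var' (initialized to 0.5) and broadcasts it to the output shape,
# which is what lets the tests predict the policy output and overwrite
# 'return_var' by name.
import tensorflow as tf
from garage.tf.models import Model


class SimpleMLPModel(Model):
    """Minimal stand-in for MLPModel used in these tests (assumed fixture)."""

    def __init__(self, output_dim, name=None, *args, **kwargs):
        # Extra args/kwargs (hidden_sizes, nonlinearities, ...) are ignored.
        super().__init__(name)
        self.output_dim = output_dim

    def _build(self, obs_input, name=None):
        # A single scalar variable the test can look up and reload; every
        # output element equals its current value.
        return_var = tf.compat.v1.get_variable(
            'return_var', (), initializer=tf.constant_initializer(0.5))
        return tf.fill((tf.shape(obs_input)[0], self.output_dim), return_var)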
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: The tabular log file path.
    '''
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))
        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
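# run_garage above reads its hyperparameters from a module-level 'params'
# dict. The values below are a hypothetical example, not the benchmark's
# actual settings; only the key names are taken from the code above.
params = {
    'policy_lr': 1e-4,
    'qf_lr': 1e-3,
    'policy_hidden_sizes': [64, 64],
    'qf_hidden_sizes': [64, 64],
    'n_epochs': 500,
    'n_epoch_cycles': 20,
    'n_rollout_steps': 100,
    'n_train_steps': 50,
    'discount': 0.9,
    'tau': 1e-2,
    'replay_buffer_size': int(1e6),
    'sigma': 0.2,
}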
def test_no_reset(self):
    with LocalRunner(sess=self.sess) as runner:
        # This tests if the off-policy sampler respects batch_size
        # when no_reset is set to True
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=[64, 64],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
        )

        sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True)
        sampler.start_worker()

        runner.initialize_tf_vars()

        paths1 = sampler.obtain_samples(0, 5)
        paths2 = sampler.obtain_samples(0, 5)

        len1 = sum([len(path['rewards']) for path in paths1])
        len2 = sum([len(path['rewards']) for path in paths2])

        assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size'
        # yapf: disable
        assert (len(paths1[0]['rewards']) + len(paths2[0]['rewards']) ==
                paths2[0]['running_length']), (
            'Running length should be the length of full path')
        # yapf: enable
        assert np.isclose(
            paths1[0]['rewards'].sum() + paths2[0]['rewards'].sum(),
            paths2[0]['undiscounted_return']
        ), 'Undiscounted_return should be the sum of rewards of full path'
def test_get_action(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'continuous_mlp_policy_with_model.MLPModel'),
                    new=SimpleMLPModel):
        policy = ContinuousMLPPolicyWithModel(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    action, _ = policy.get_action(obs)

    expected_action = np.full(action_dim, 0.5)

    assert env.action_space.contains(action)
    assert np.array_equal(action, expected_action)

    actions, _ = policy.get_actions([obs, obs, obs])
    for action in actions:
        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            n_epoch_cycles=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100, n_epoch_cycles=20)
def test_get_action_sym(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'continuous_mlp_policy_with_model.MLPModel'),
                    new=SimpleMLPModel):
        policy = ContinuousMLPPolicyWithModel(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    obs_dim = env.spec.observation_space.flat_dim
    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim))
    action_sym = policy.get_action_sym(state_input, name='action_sym')

    expected_action = np.full(action_dim, 0.5)

    action = self.sess.run(action_sym,
                           feed_dict={state_input: [obs.flatten()]})
    action = policy.action_space.unflatten(action)

    assert np.array_equal(action, expected_action)
    assert env.action_space.contains(action)
def test_ddpg_pendulum(self):
    """Test DDPG with the InvertedPendulum-v2 environment.

    This environment has a [-3, 3] action_space bound.
    """
    with LocalRunner(sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=[64, 64],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10,
                                    n_epoch_cycles=20,
                                    batch_size=100)
        assert last_avg_ret > 10

        env.close()
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicyWithModel(env_spec=env.spec,
                                              hidden_sizes=[64, 64],
                                              hidden_nonlinearity=tf.nn.relu,
                                              output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    policy_lr=1e-4,
                    qf_lr=1e-3,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    target_update_tau=1e-2,
                    n_train_steps=50,
                    discount=0.9,
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)
class TestContinuousMLPPolicyWithModelTransit(TfGraphTestCase):
    def setup_method(self):
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = ContinuousMLPPolicy(
                env_spec=self.box_env, hidden_sizes=(32, 32), name='P1')
            self.policy2 = ContinuousMLPPolicy(
                env_spec=self.box_env, hidden_sizes=(64, 64), name='P2')
            self.policy3 = ContinuousMLPPolicyWithModel(
                env_spec=self.box_env, hidden_sizes=(32, 32), name='P3')
            self.policy4 = ContinuousMLPPolicyWithModel(
                env_spec=self.box_env, hidden_sizes=(64, 64), name='P4')

        self.sess.run(tf.compat.v1.global_variables_initializer())

        # Copy the old policies' parameters into the model-based policies so
        # both variants compute the same actions.
        for a, b in zip(self.policy3.get_params(), self.policy1.get_params()):
            self.sess.run(a.assign(b))
        for a, b in zip(self.policy4.get_params(), self.policy2.get_params()):
            self.sess.run(a.assign(b))

        self.obs = self.box_env.reset()
        self.action_bound = self.box_env.action_space.high

        assert self.policy1.vectorized == self.policy2.vectorized
        assert self.policy3.vectorized == self.policy4.vectorized

    @mock.patch('numpy.random.normal')
    def test_get_action(self, mock_rand):
        mock_rand.return_value = 0.5
        action1, _ = self.policy1.get_action(self.obs)
        action2, _ = self.policy2.get_action(self.obs)
        action3, _ = self.policy3.get_action(self.obs)
        action4, _ = self.policy4.get_action(self.obs)

        assert np.array_equal(action1, action3 * self.action_bound)
        assert np.array_equal(action2, action4 * self.action_bound)

        actions1, _ = self.policy1.get_actions([self.obs, self.obs])
        actions2, _ = self.policy2.get_actions([self.obs, self.obs])
        actions3, _ = self.policy3.get_actions([self.obs, self.obs])
        actions4, _ = self.policy4.get_actions([self.obs, self.obs])

        assert np.array_equal(actions1, actions3 * self.action_bound)
        assert np.array_equal(actions2, actions4 * self.action_bound)

    def test_get_action_sym(self):
        obs_dim = self.box_env.spec.observation_space.flat_dim
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim))

        action_sym1 = self.policy1.get_action_sym(state_input,
                                                  name='action_sym')
        action_sym2 = self.policy2.get_action_sym(state_input,
                                                  name='action_sym')
        action_sym3 = self.policy3.get_action_sym(state_input,
                                                  name='action_sym')
        action_sym4 = self.policy4.get_action_sym(state_input,
                                                  name='action_sym')

        action1 = self.sess.run(action_sym1,
                                feed_dict={state_input: [self.obs]})
        action2 = self.sess.run(action_sym2,
                                feed_dict={state_input: [self.obs]})
        action3 = self.sess.run(action_sym3,
                                feed_dict={state_input: [self.obs]})
        action4 = self.sess.run(action_sym4,
                                feed_dict={state_input: [self.obs]})

        assert np.array_equal(action1, action3 * self.action_bound)
        assert np.array_equal(action2, action4 * self.action_bound)
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: The tabular log file path.
    '''
    deterministic.set_seed(seed)
    env.reset()

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        logger.add_output(dowel.StdOutput())
        logger.add_output(dowel.CsvOutput(tabular_log_file))
        logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return tabular_log_file
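# A minimal, hypothetical driver for the HER benchmark helper above; the
# environment id, seed, and log directory are illustrative assumptions and
# not part of the original benchmark script.
if __name__ == '__main__':
    env = gym.make('FetchReach-v1')
    log_file = run_garage(env, seed=1, log_dir='/tmp/her_ddpg_benchmark')
    print('progress written to', log_file)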