def test_no_reset(self): with LocalTFRunner(snapshot_config, sess=self.sess) as runner: # This tests if off-policy sampler respect batch_size # when no_reset is set to True env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2'))) action_noise = OUStrategy(env.spec, sigma=0.2) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100) algo = DDPG( env_spec=env.spec, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, target_update_tau=1e-2, n_train_steps=50, discount=0.9, min_buffer_size=int(1e4), exploration_strategy=action_noise, ) sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True) sampler.start_worker() runner.initialize_tf_vars() paths1 = sampler.obtain_samples(0, 5) paths2 = sampler.obtain_samples(0, 5) len1 = sum([len(path['rewards']) for path in paths1]) len2 = sum([len(path['rewards']) for path in paths2]) assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size' # yapf: disable # When done is False in 1st sampling, the next sampling should be # stacked with the last batch in 1st sampling case1 = (len(paths1[-1]['rewards']) + len(paths2[0]['rewards']) == paths2[0]['running_length']) # When done is True in 1st sampling, the next sampling should be # separated case2 = len(paths2[0]['rewards']) == paths2[0]['running_length'] done = paths1[-1]['dones'][-1] assert ( (not done and case1) or (done and case2) ), 'Running length should be the length of full path' # yapf: enable case1 = np.isclose( paths1[-1]['rewards'].sum() + paths2[0]['rewards'].sum(), paths2[0]['undiscounted_return']) case2 = np.isclose(paths2[0]['rewards'].sum(), paths2[0]['undiscounted_return']) assert ( (not done and case1) or (done and case2) ), 'Undiscounted_return should be the sum of rewards of full path'
def run_garage(env, seed, log_dir): """ Create garage model and training. Replace the ppo with the algorithm you want to run. :param env: Environment of the task. :param seed: Random seed for the trial. :param log_dir: Log dir path. :return: """ deterministic.set_seed(seed) env.reset() with LocalRunner() as runner: env = TfEnv(env) action_noise = OUStrategy(env.spec, sigma=params['sigma']) policy = ContinuousMLPPolicy( env_spec=env.spec, hidden_sizes=params['policy_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh, input_include_goal=True, ) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_sizes=params['qf_hidden_sizes'], hidden_nonlinearity=tf.nn.relu, input_include_goal=True, ) replay_buffer = HerReplayBuffer( env_spec=env.spec, size_in_transitions=params['replay_buffer_size'], time_horizon=params['n_rollout_steps'], replay_k=0.4, reward_fun=env.compute_reward, ) algo = DDPG( env_spec=env.spec, policy=policy, qf=qf, replay_buffer=replay_buffer, policy_lr=params['policy_lr'], qf_lr=params['qf_lr'], plot=False, target_update_tau=params['tau'], n_epochs=params['n_epochs'], n_epoch_cycles=params['n_epoch_cycles'], n_train_steps=params['n_train_steps'], discount=params['discount'], exploration_strategy=action_noise, policy_optimizer=tf.train.AdamOptimizer, qf_optimizer=tf.train.AdamOptimizer, buffer_batch_size=256, input_include_goal=True, ) # Set up logger since we are not using run_experiment tabular_log_file = osp.join(log_dir, 'progress.csv') logger.add_output(StdOutput()) logger.add_output(CsvOutput(tabular_log_file)) logger.add_output(TensorBoardOutput(log_dir)) runner.setup(algo, env) runner.train( n_epochs=params['n_epochs'], n_epoch_cycles=params['n_epoch_cycles'], batch_size=params['n_rollout_steps']) logger.remove_all() return tabular_log_file