def test_local_batch_sampler(self):
    """Compare LocalSampler against OnPolicyVectorizedSampler."""
    workers = WorkerFactory(seed=100,
                            max_path_length=self.algo.max_path_length)
    sampler1 = LocalSampler.from_worker_factory(workers, self.policy,
                                                self.env)
    sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
    sampler2.start_worker()
    trajs1 = sampler1.obtain_samples(
        0, 1000, tuple(self.algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000)
    # pylint: disable=superfluous-parens
    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2[0]['rewards']) == 1)

    # Ground-truth states, actions, and rewards for one deterministic episode.
    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    # Every trajectory in the batch should match the expected episode.
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()

def test_ray_batch_sampler(self):
    sampler1 = RaySampler(self.algo,
                          self.env,
                          seed=100,
                          num_processors=1,
                          sampler_worker_cls=SamplerWorker)
    sampler1.start_worker()
    sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
    sampler2.start_worker()
    trajs1 = sampler1.obtain_samples(0, 16)
    trajs2 = sampler2.obtain_samples(0, 1)
    assert (trajs1[0]['observations'].shape == np.array(
        trajs2[0]['observations']).shape == (6, 16))
    traj2_action_shape = np.array(trajs2[0]['actions']).shape
    assert trajs1[0]['actions'].shape == traj2_action_shape == (6, 4)
    assert sum(trajs1[0]['rewards']) == sum(trajs2[0]['rewards']) == 1

    true_obs = np.array(
        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])
    true_actions = np.array([[0., 0., 1., 0.],
                             [0., 0., 1., 0.],
                             [0., 1., 0., 0.],
                             [0., 1., 0., 0.],
                             [0., 1., 0., 0.],
                             [0., 0., 1., 0.]])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    for trajectory in trajs1:
        assert np.array_equal(trajectory['observations'], true_obs)
        assert np.array_equal(trajectory['actions'], true_actions)
        assert np.array_equal(trajectory['rewards'], true_rewards)
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()

def test_ray_batch_sampler(self):
    """Compare RaySampler against OnPolicyVectorizedSampler."""
    workers = WorkerFactory(seed=100,
                            max_path_length=self.algo.max_path_length)
    sampler1 = RaySampler(workers, self.policy, self.env, num_processors=1)
    sampler1.start_worker()
    sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
    sampler2.start_worker()
    trajs1 = sampler1.obtain_samples(
        0, 1000, tuple(self.algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000)
    # pylint: disable=superfluous-parens
    assert (trajs1[0]['observations'].shape == np.array(
        trajs2[0]['observations']).shape == (6, ))
    traj2_action_shape = np.array(trajs2[0]['actions']).shape
    assert trajs1[0]['actions'].shape == traj2_action_shape == (6, )
    assert sum(trajs1[0]['rewards']) == sum(trajs2[0]['rewards']) == 1

    # Ground-truth states, actions, and rewards for one deterministic episode.
    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    for trajectory in trajs1:
        assert np.array_equal(trajectory['observations'], true_obs)
        assert np.array_equal(trajectory['actions'], true_actions)
        assert np.array_equal(trajectory['rewards'], true_rewards)
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()

def test_ray_batch_sampler(ray_local_session_fixture):
    """Compare RaySampler against OnPolicyVectorizedSampler in a local Ray session."""
    # The fixture only guarantees a running local Ray session; its value is unused.
    del ray_local_session_fixture
    env = TfEnv(GridWorldEnv(desc='4x4'))
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_path_length=16)
    assert ray.is_initialized()
    workers = WorkerFactory(seed=100, max_path_length=algo.max_path_length)
    sampler1 = RaySampler(workers, policy, env)
    sampler1.start_worker()
    sampler2 = OnPolicyVectorizedSampler(algo, env)
    sampler2.start_worker()
    trajs1 = sampler1.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000)
    # pylint: disable=superfluous-parens
    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2[0]['rewards']) == 1)

    # Ground-truth trajectory for the scripted policy in the 4x4 grid world.
    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    # Every sampled trajectory should match the scripted episode exactly.
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()
    env.close()
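

# The test above expects a pytest fixture that brings up a local Ray session
# before the test body runs and tears it down afterwards. Below is a minimal
# sketch of such a fixture; the name matches the argument used above, but the
# body is an assumption and may differ from the actual fixture in the suite.
import pytest
import ray


@pytest.fixture
def ray_local_session_fixture():
    """Start a local Ray session for a test and shut it down afterwards."""
    if not ray.is_initialized():
        ray.init(local_mode=True, ignore_reinit_error=True)
    yield
    if ray.is_initialized():
        ray.shutdown()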