def evaluate(self, algo, test_rollouts_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
        test_rollouts_per_task (int or None): Number of rollouts per task.

    """
    if test_rollouts_per_task is None:
        test_rollouts_per_task = self._n_test_rollouts
    adapted_trajectories = []
    logger.log('Sampling for adaptation and meta-testing...')
    if self._test_sampler is None:
        self._test_sampler = LocalSampler.from_worker_factory(
            WorkerFactory(seed=get_seed(),
                          max_episode_length=self._max_episode_length,
                          n_workers=1,
                          worker_class=self._worker_class,
                          worker_args=self._worker_args),
            agents=algo.get_exploration_policy(),
            envs=self._test_task_sampler.sample(1))
    for env_up in self._test_task_sampler.sample(self._n_test_tasks):
        policy = algo.get_exploration_policy()
        traj = TrajectoryBatch.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_traj)
        ])
        adapted_policy = algo.adapt_policy(policy, traj)
        adapted_traj = self._test_sampler.obtain_samples(
            self._eval_itr,
            test_rollouts_per_task * self._max_episode_length,
            adapted_policy)
        adapted_trajectories.append(adapted_traj)
    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            TrajectoryBatch.concatenate(*adapted_trajectories),
            getattr(algo, 'discount', 1.0),
            name_map=name_map)
    self._eval_itr += 1

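# A minimal, self-contained sketch of the protocol evaluate() drives above:
# get an exploration policy, collect a few exploration rollouts on the new
# task, adapt once, then evaluate the adapted policy. StubAlgo and the
# task/trajectory placeholders are hypothetical stand-ins, not garage APIs;
# they only show the call order a MetaRLAlgorithm must support.


class StubAlgo:
    """Hypothetical MetaRLAlgorithm exposing the meta-test interface."""

    def get_exploration_policy(self):
        # Policy used to gather adaptation data on an unseen task.
        return {'params': 'pre-adaptation'}

    def adapt_policy(self, policy, exploration_trajectories):
        # Specialize the policy using the exploration data.
        return {'params': 'adapted', 'n_traj': len(exploration_trajectories)}


def meta_test(algo, test_tasks, n_exploration_traj):
    adapted = {}
    for task in test_tasks:
        policy = algo.get_exploration_policy()
        exploration_trajs = ['traj'] * n_exploration_traj  # placeholder data
        adapted[task] = algo.adapt_policy(policy, exploration_trajs)
    return adapted


assert meta_test(StubAlgo(), ['push', 'reach'], 2)['push']['n_traj'] == 2
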
def to_batch(self):
    """Convert this in-progress trajectory into a TrajectoryBatch.

    Returns:
        TrajectoryBatch: This trajectory as a batch.

    Raises:
        AssertionError: If this trajectory contains no time steps.

    """
    assert len(self.rewards) > 0
    env_infos = dict(self.env_infos)
    agent_infos = dict(self.agent_infos)
    for k, v in env_infos.items():
        env_infos[k] = np.asarray(v)
    for k, v in agent_infos.items():
        agent_infos[k] = np.asarray(v)
    return TrajectoryBatch(env_spec=self.env.spec,
                           observations=np.asarray(self.observations[:-1]),
                           last_observations=np.asarray([self.last_obs]),
                           actions=np.asarray(self.actions),
                           rewards=np.asarray(self.rewards),
                           terminals=np.asarray(self.terminals),
                           env_infos=env_infos,
                           agent_infos=agent_infos,
                           lengths=np.asarray([len(self.rewards)],
                                              dtype='l'))

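# The info-dict conversion used in to_batch() (and by the collect_rollout
# variants below): per-step env/agent infos accumulate as lists under each
# key, then get stacked into numpy arrays with a leading batch dimension.
# A minimal sketch with made-up keys:
import collections

import numpy as np

agent_infos = collections.defaultdict(list)
for step in range(3):
    agent_infos['log_std'].append([0.1, 0.2])     # per-step vector info
    agent_infos['value_est'].append(float(step))  # per-step scalar info

stacked = {k: np.asarray(v) for k, v in agent_infos.items()}
assert stacked['log_std'].shape == (3, 2)   # (T, info_dim)
assert stacked['value_est'].shape == (3,)   # (T,)
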
def _obtain_evaluation_samples(self, env, num_trajs=100,
                               max_path_length=1000):
    """Sample the policy for num_trajs trajectories deterministically.

    Args:
        env (garage.envs.GarageEnv): The environment used to obtain
            trajectories.
        num_trajs (int): Number of trajectories.
        max_path_length (int): Maximum number of steps in one trajectory.

    Returns:
        TrajectoryBatch: Evaluation trajectories, representing the best
            current performance of the algorithm.

    """
    paths = []
    for _ in range(num_trajs):
        path = rollout(env,
                       self.policy,
                       max_path_length=max_path_length,
                       deterministic=True)
        paths.append(path)
    return TrajectoryBatch.from_trajectory_list(self.env_spec, paths)

def obtain_exact_trajectories(self, n_traj_per_worker, agent_update,
                              env_update=None):
    """Sample an exact number of trajectories per worker.

    Args:
        n_traj_per_worker (int): Exact number of trajectories to gather
            for each worker.
        agent_update (object): Value which will be passed into the
            `agent_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.
        env_update (object): Value which will be passed into the
            `env_update_fn` before doing rollouts. If a list is passed in,
            it must have length exactly `factory.n_workers`, and will be
            spread across the workers.

    Returns:
        TrajectoryBatch: Batch of gathered trajectories. Always in worker
            order. In other words, first all trajectories from worker 0,
            then all trajectories from worker 1, etc.

    """
    self._update_workers(agent_update, env_update)
    batches = []
    for worker in self._workers:
        for _ in range(n_traj_per_worker):
            batch = worker.rollout()
            batches.append(batch)
    return TrajectoryBatch.concatenate(*batches)

def obtain_samples(self, itr, num_samples, agent_update, env_update=None):
    """Collect at least a given number of transitions (timesteps).

    Args:
        itr (int): The current iteration number. Using this argument is
            deprecated.
        num_samples (int): Minimum number of transitions / timesteps to
            sample.
        agent_update (object): Value which will be passed into the
            `agent_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.
        env_update (object): Value which will be passed into the
            `env_update_fn` before doing rollouts. If a list is passed in,
            it must have length exactly `factory.n_workers`, and will be
            spread across the workers.

    Returns:
        garage.TrajectoryBatch: The batch of collected trajectories.

    """
    self._update_workers(agent_update, env_update)
    batches = []
    completed_samples = 0
    while True:
        for worker in self._workers:
            batch = worker.rollout()
            completed_samples += len(batch.actions)
            batches.append(batch)
            if completed_samples >= num_samples:
                return TrajectoryBatch.concatenate(*batches)

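# Sketch of the sampling loop in obtain_samples(): workers are polled
# round-robin, each rollout contributes its full length, and the loop stops
# as soon as the running total reaches num_samples (so the batch may
# overshoot by up to one rollout). FakeWorker is a hypothetical stub, not a
# garage class.
import itertools


class FakeWorker:
    """Hypothetical worker whose rollouts are a fixed number of steps."""

    def __init__(self, steps_per_rollout):
        self._steps = steps_per_rollout

    def rollout(self):
        # Stands in for a TrajectoryBatch of this many timesteps.
        return self._steps


def obtain_at_least(workers, num_samples):
    completed, rollouts = 0, []
    for worker in itertools.cycle(workers):
        length = worker.rollout()
        completed += length
        rollouts.append(length)
        if completed >= num_samples:
            return rollouts


# Two workers with 7-step rollouts: 20 samples takes 3 rollouts (21 steps).
assert obtain_at_least([FakeWorker(7), FakeWorker(7)], 20) == [7, 7, 7]
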
def _gather_rollout(self, rollout_number, last_observation):
    """Complete rollout `rollout_number` and reset its buffers."""
    assert 0 < self._path_lengths[
        rollout_number] <= self._max_episode_length
    env_infos = self._env_infos[rollout_number]
    agent_infos = self._agent_infos[rollout_number]
    for k, v in env_infos.items():
        env_infos[k] = np.asarray(v)
    for k, v in agent_infos.items():
        agent_infos[k] = np.asarray(v)
    traj = TrajectoryBatch(
        env_spec=self._envs[rollout_number].spec,
        observations=np.asarray(self._observations[rollout_number]),
        last_observations=np.asarray([last_observation]),
        actions=np.asarray(self._actions[rollout_number]),
        rewards=np.asarray(self._rewards[rollout_number]),
        step_types=np.asarray(self._step_types[rollout_number],
                              dtype=StepType),
        env_infos=dict(env_infos),
        agent_infos=dict(agent_infos),
        lengths=np.asarray([self._path_lengths[rollout_number]], dtype='l'))
    self._completed_rollouts.append(traj)
    self._observations[rollout_number] = []
    self._actions[rollout_number] = []
    self._rewards[rollout_number] = []
    self._step_types[rollout_number] = []
    self._path_lengths[rollout_number] = 0
    self._prev_obs[rollout_number] = self._envs[rollout_number].reset()
    self._env_infos[rollout_number] = collections.defaultdict(list)
    self._agent_infos[rollout_number] = collections.defaultdict(list)

def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic rollouts.

    Statistics such as (average) discounted return and success rate are
    recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across
            self._num_evaluation_trajectories trajectories.

    """
    eval_trajs = []
    for _ in range(self._num_tasks):
        eval_trajs.append(
            obtain_evaluation_samples(
                self.policy,
                self._eval_env,
                num_trajs=self._num_evaluation_trajectories))
    eval_trajs = TrajectoryBatch.concatenate(*eval_trajs)
    last_return = log_multitask_performance(epoch, eval_trajs,
                                            self._discount)
    return last_return

def collect_rollout(self):
    """Collect the current rollout, clearing the internal buffer.

    Returns:
        garage.TrajectoryBatch: A batch of the trajectories completed
            since the last call to collect_rollout().

    """
    observations = self._observations
    self._observations = []
    last_observations = self._last_observations
    self._last_observations = []
    actions = self._actions
    self._actions = []
    rewards = self._rewards
    self._rewards = []
    terminals = self._terminals
    self._terminals = []
    env_infos = self._env_infos
    self._env_infos = defaultdict(list)
    agent_infos = self._agent_infos
    self._agent_infos = defaultdict(list)
    for k, v in agent_infos.items():
        agent_infos[k] = np.asarray(v)
    for k, v in env_infos.items():
        env_infos[k] = np.asarray(v)
    lengths = self._lengths
    self._lengths = []
    return TrajectoryBatch(self.env.spec, np.asarray(observations),
                           np.asarray(last_observations),
                           np.asarray(actions), np.asarray(rewards),
                           np.asarray(terminals), dict(env_infos),
                           dict(agent_infos),
                           np.asarray(lengths, dtype='i'))

def test_agent_infos_batch_mismatch_traj(traj_data):
    with pytest.raises(
            ValueError,
            match='entry in agent_infos must have a batch dimension'):
        traj_data['agent_infos']['hidden'] = traj_data['agent_infos'][
            'hidden'][:-1]
        t = TrajectoryBatch(**traj_data)
        del t

def test_act_box_env_spec_mismatch_traj(traj_data):
    with pytest.raises(ValueError, match='actions should have'):
        traj_data['env_spec'].action_space = akro.Box(low=1,
                                                      high=np.inf,
                                                      shape=(4, 3, 2),
                                                      dtype=np.float32)
        t = TrajectoryBatch(**traj_data)
        del t

def test_to_trajectory_list(traj_data):
    t = TrajectoryBatch(**traj_data)
    t_list = t.to_trajectory_list()
    assert len(t_list) == len(traj_data['lengths'])
    start = 0
    for length, last_obs, s in zip(traj_data['lengths'],
                                   traj_data['last_observations'], t_list):
        stop = start + length
        assert (s['observations'] ==
                traj_data['observations'][start:stop]).all()
        assert (s['next_observations'] == np.concatenate(
            (traj_data['observations'][start + 1:stop], [last_obs]))).all()
        assert (s['actions'] == traj_data['actions'][start:stop]).all()
        assert (s['rewards'] == traj_data['rewards'][start:stop]).all()
        assert (s['step_types'] ==
                traj_data['step_types'][start:stop]).all()
        start = stop
    assert start == len(traj_data['rewards'])

def test_time_step_batch_from_trajectory_batch(traj_data):
    traj = TrajectoryBatch(**traj_data)
    timestep_batch = TimeStepBatch.from_trajectory_batch(traj)
    assert (timestep_batch.observations == traj.observations).all()
    assert (timestep_batch.next_observations[:traj.lengths[0] - 1] ==
            traj.observations[1:traj.lengths[0]]).all()
    assert (timestep_batch.next_observations[traj.lengths[0] - 1] ==
            traj.last_observations[0]).all()

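# The relationship the two tests above rely on: within one trajectory of
# length L, next_observations is observations shifted left by one step,
# with last_observations supplying the final entry. A minimal numpy check:
import numpy as np

observations = np.arange(5)       # obs at steps 0..4 of one trajectory
last_observation = np.array([5])  # obs seen after the final action

next_observations = np.concatenate((observations[1:], last_observation))
assert (next_observations[:-1] == observations[1:]).all()
assert next_observations[-1] == last_observation[0]
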
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return in last epoch cycle.

    """
    # -- Stage: Calculate baseline
    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_episode_length,
                                    baseline_predictions, self._discount)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)
    rtn = samples_data['average_return']
    self._all_returns.append(samples_data['average_return'])

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn

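# The epoch / sample-index bookkeeping in train_once() above: with
# n_samples candidate parameter vectors per CMA-ES generation, iteration
# itr maps to generation itr // n_samples and candidate itr % n_samples.
# A quick check that the arithmetic is just divmod:
n_samples = 3
for itr in range(6):
    epoch = itr // n_samples
    i_sample = itr - epoch * n_samples
    assert (epoch, i_sample) == divmod(itr, n_samples)
# itr 0..5 -> (0, 0) (0, 1) (0, 2) (1, 0) (1, 1) (1, 2)
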
def collect_rollout(self):
    """Collect the current rollout, clearing the internal buffer.

    One-hot task id is saved in env_infos['task_onehot']. Latent is saved
    in agent_infos['latent']. Latent infos are saved in
    agent_infos['latent_info_name'], where info_name is the original
    latent info name.

    Returns:
        garage.TrajectoryBatch: A batch of the trajectories completed
            since the last call to collect_rollout().

    """
    observations = self._observations
    self._observations = []
    last_observations = self._last_observations
    self._last_observations = []
    actions = self._actions
    self._actions = []
    rewards = self._rewards
    self._rewards = []
    step_types = self._step_types
    self._step_types = []
    latents = self._latents
    self._latents = []
    tasks = self._tasks
    self._tasks = []
    env_infos = self._env_infos
    self._env_infos = defaultdict(list)
    agent_infos = self._agent_infos
    self._agent_infos = defaultdict(list)
    latent_infos = self._latent_infos
    self._latent_infos = defaultdict(list)
    for k, v in latent_infos.items():
        latent_infos[k] = np.asarray(v)
    for k, v in agent_infos.items():
        agent_infos[k] = np.asarray(v)
    for k, v in env_infos.items():
        env_infos[k] = np.asarray(v)
    env_infos['task_onehot'] = np.asarray(tasks)
    agent_infos['latent'] = np.asarray(latents)
    for k, v in latent_infos.items():
        agent_infos['latent_{}'.format(k)] = v
    lengths = self._lengths
    self._lengths = []
    return TrajectoryBatch(env_spec=self.env.spec,
                           observations=np.asarray(observations),
                           last_observations=np.asarray(last_observations),
                           actions=np.asarray(actions),
                           rewards=np.asarray(rewards),
                           step_types=np.asarray(step_types,
                                                 dtype=StepType),
                           env_infos=dict(env_infos),
                           agent_infos=dict(agent_infos),
                           lengths=np.asarray(lengths, dtype='i'))

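# The latent-info merge performed above: every key in latent_infos is
# re-exposed on agent_infos under a 'latent_'-prefixed name, alongside the
# sampled latent itself. A sketch with made-up keys:
import numpy as np

latents = [[0.5, -0.5], [0.4, -0.6]]
latent_infos = {'mean': np.zeros((2, 2)), 'log_std': np.ones((2, 2))}

agent_infos = {'latent': np.asarray(latents)}
for k, v in latent_infos.items():
    agent_infos['latent_{}'.format(k)] = v
assert set(agent_infos) == {'latent', 'latent_mean', 'latent_log_std'}
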
def test_new(traj_data):
    t = TrajectoryBatch(**traj_data)
    assert t.env_spec is traj_data['env_spec']
    assert t.observations is traj_data['observations']
    assert t.actions is traj_data['actions']
    assert t.rewards is traj_data['rewards']
    assert t.terminals is traj_data['terminals']
    assert t.env_infos is traj_data['env_infos']
    assert t.agent_infos is traj_data['agent_infos']
    assert t.lengths is traj_data['lengths']

def test_new_traj(traj_data):
    t = TrajectoryBatch(**traj_data)
    assert t.env_spec is traj_data['env_spec']
    assert t.observations is traj_data['observations']
    assert t.last_observations is traj_data['last_observations']
    assert t.actions is traj_data['actions']
    assert t.rewards is traj_data['rewards']
    assert t.env_infos is traj_data['env_infos']
    assert t.agent_infos is traj_data['agent_infos']
    assert t.step_types is traj_data['step_types']
    assert t.lengths is traj_data['lengths']

def test_log_multitask_performance_task_id():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array(
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
            dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool),
            'task_id':
            np.array([1] * 10 + [3] * 5 + [1] + [4])
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_multitask_performance(7, batch, 0.8, {
        1: 'env1',
        3: 'env2',
        4: 'env3',
        5: 'env4'
    })
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['env1/Iteration'] == 7
    assert res['env2/Iteration'] == 7
    assert res['env3/Iteration'] == 7
    assert res['env4/Iteration'] == 7
    assert res['env1/NumTrajs'] == 2
    assert res['env2/NumTrajs'] == 1
    assert res['env3/NumTrajs'] == 1
    assert res['env4/NumTrajs'] == 0
    assert math.isclose(res['env1/SuccessRate'], 0.5)
    assert math.isclose(res['env2/SuccessRate'], 1.0)
    assert math.isclose(res['env3/SuccessRate'], 1.0)
    assert math.isnan(res['env4/SuccessRate'])
    assert math.isnan(res['env4/AverageReturn'])

def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    # -- Stage: Calculate baseline
    paths = [
        dict(observations=self._env_spec.observation_space.flatten_n(
            path['observations'])
             if self._flatten_input else path['observations'],
             actions=(
                 self._env_spec.action_space.flatten_n(  # noqa: E126
                     path['actions'])),
             rewards=path['rewards'],
             env_infos=path['env_infos'],
             agent_infos=path['agent_infos'],
             dones=path['dones']) for path in paths
    ]
    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_path_length,
                                    baseline_predictions, self._discount,
                                    self._gae_lambda)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    self.log_diagnostics(samples_data)
    logger.log('Optimizing policy...')
    self.optimize_policy(samples_data)
    return samples_data['average_return']

def train(self, runner):
    """Obtain samples and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner.

    """
    for epoch in runner.step_epochs():
        samples = runner.obtain_samples(epoch)
        log_performance(epoch,
                        TrajectoryBatch.from_trajectory_list(
                            self.env_spec, samples),
                        self._discount)
        self._train_once(samples)

def collect_rollout(self):
    """Collect all completed rollouts.

    Returns:
        garage.TrajectoryBatch: A batch of the trajectories completed
            since the last call to collect_rollout().

    """
    if len(self._completed_rollouts) == 1:
        result = self._completed_rollouts[0]
    else:
        result = TrajectoryBatch.concatenate(*self._completed_rollouts)
    self._completed_rollouts = []
    return result

def train(self, runner):
    """Get samples and train the policy.

    Args:
        runner (LocalRunner): LocalRunner.

    """
    for epoch in runner.step_epochs():
        samples = runner.obtain_samples(epoch)
        log_performance(epoch,
                        TrajectoryBatch.from_trajectory_list(
                            self.env_spec, samples),
                        self._discount)
        self._train_once(epoch, samples)

def evaluate(self, algo):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.

    """
    adapted_trajectories = []
    for env_up in self._test_task_sampler.sample(self._n_test_tasks):
        policy = algo.get_exploration_policy()
        traj = TrajectoryBatch.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_traj)
        ])
        adapted_policy = algo.adapt_policy(policy, traj)
        adapted_traj = self._test_sampler.obtain_samples(
            self._eval_itr, 1, adapted_policy)
        adapted_trajectories.append(adapted_traj)
    log_performance(self._eval_itr,
                    TrajectoryBatch.concatenate(*adapted_trajectories),
                    getattr(algo, 'discount', 1.0),
                    prefix=self._prefix)
    self._eval_itr += 1

def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        step_types=np.array(
            [StepType.FIRST] + [StepType.MID] * (lengths[0] - 2) +
            [StepType.TERMINAL] + [StepType.FIRST] +
            [StepType.MID] * (lengths[1] - 2) + [StepType.TERMINAL] +
            [StepType.FIRST] + [StepType.FIRST],
            dtype=StepType),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumTrajs'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/TerminationRate'], 0.5)
    assert math.isclose(
        res['test_log_performance/AverageDiscountedReturn'],
        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)

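# A self-contained check of the statistics asserted above, computed
# directly from the flat reward array and per-trajectory lengths (the
# discounted return of one trajectory is sum_t gamma^t * r_t):
import numpy as np

lengths = np.array([10, 5, 1, 1])
rewards = np.array([
    0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901, 0.22631398,
    0.03421301, 0.97515046, 0.64311832, 0.65068933, 0.17657714, 0.04783857,
    0.73904013, 0.41364329, 0.52235551, 0.24203526, 0.43328910
])
gamma = 0.8

returns, discounted = [], []
start = 0
for length in lengths:
    r = rewards[start:start + length]
    returns.append(r.sum())
    discounted.append((gamma**np.arange(length) * r).sum())
    start += length

assert np.isclose(np.mean(discounted), 1.1131040640673113)
assert np.isclose(np.mean(returns), 2.1659965525)
assert np.isclose(np.std(returns), 2.354067152038576)
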
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average undiscounted return.

    """
    obs, actions, rewards, valids, baselines = self.process_samples(
        itr, paths)

    loss = self._compute_loss(itr, obs, actions, rewards, valids,
                              baselines)

    self._old_policy.load_state_dict(self.policy.state_dict())

    self._optimizer.zero_grad()
    loss.backward()

    kl_before = self._compute_kl_constraint(obs).detach()
    self._optimize(itr, obs, actions, rewards, valids, baselines)

    with torch.no_grad():
        loss_after = self._compute_loss(itr, obs, actions, rewards,
                                        valids, baselines)
        kl = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    average_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)

    with tabular.prefix(self.policy.name):
        tabular.record('LossBefore', loss.item())
        tabular.record('LossAfter', loss_after.item())
        tabular.record('dLoss', loss.item() - loss_after.item())
        tabular.record('KLBefore', kl_before.item())
        tabular.record('KL', kl.item())
        tabular.record('Entropy', policy_entropy.mean().item())

    self.baseline.fit(paths)
    return np.mean(average_returns)

def log_performance(self, itr, all_samples, loss_before, loss_after,
                    kl_before, kl, policy_entropy):
    """Evaluate performance of this batch.

    Args:
        itr (int): Iteration number.
        all_samples (list[list[MAMLTrajectoryBatch]]): Two-dimensional
            list of MAMLTrajectoryBatch of size
            [meta_batch_size * (num_grad_updates + 1)].
        loss_before (float): Loss before optimization step.
        loss_after (float): Loss after optimization step.
        kl_before (float): KL divergence before optimization step.
        kl (float): KL divergence after optimization step.
        policy_entropy (float): Policy entropy.

    Returns:
        float: The average return in last epoch cycle.

    """
    tabular.record('Iteration', itr)

    name_map = None
    if hasattr(self._env, 'all_task_names'):
        names = self._env.all_task_names
        name_map = dict(zip(names, names))

    rtns = log_multitask_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(
            env_spec=self._env.spec,
            paths=[
                path for task_paths in all_samples
                for path in task_paths[self._num_grad_updates].paths
            ]),
        discount=self._inner_algo.discount,
        name_map=name_map)

    with tabular.prefix(self._policy.name + '/'):
        tabular.record('LossBefore', loss_before)
        tabular.record('LossAfter', loss_after)
        tabular.record('dLoss', loss_before - loss_after)
        tabular.record('KLBefore', kl_before)
        tabular.record('KLAfter', kl)
        tabular.record('Entropy', policy_entropy)

    return np.mean(rtns)

def collect_rollout(self):
    """Gather fragments from all in-progress rollouts.

    Returns:
        garage.TrajectoryBatch: A batch of the trajectory fragments.

    """
    for i, frag in enumerate(self._fragments):
        assert frag.env is self._envs[i]
        if len(frag.rewards) > 0:
            complete_frag = frag.to_batch()
            self._complete_fragments.append(complete_frag)
            self._fragments[i] = InProgressTrajectory(frag.env,
                                                      frag.last_obs)
    assert len(self._complete_fragments) > 0
    result = TrajectoryBatch.concatenate(*self._complete_fragments)
    self._complete_fragments = []
    return result

def _gather_rollout(self, rollout_number, last_observation):
    """Complete rollout `rollout_number` and reset its buffers."""
    assert 0 < self._path_lengths[rollout_number] <= self._max_path_length
    traj = TrajectoryBatch(
        self._envs[rollout_number].spec,
        np.asarray(self._observations[rollout_number]),
        np.asarray([last_observation]),
        np.asarray(self._actions[rollout_number]),
        np.asarray(self._rewards[rollout_number]),
        np.asarray(self._terminals[rollout_number]),
        self._env_infos[rollout_number],
        self._agent_infos[rollout_number],
        np.asarray([self._path_lengths[rollout_number]], dtype='l'))
    self._completed_rollouts.append(traj)
    self._observations[rollout_number] = []
    self._actions[rollout_number] = []
    self._rewards[rollout_number] = []
    self._terminals[rollout_number] = []
    self._path_lengths[rollout_number] = 0
    self._prev_obs[rollout_number] = self._envs[rollout_number].reset()

def _gather_rollout(self, rollout_number, last_observation):
    """Complete rollout `rollout_number` and reset its buffers."""
    assert 0 < self._path_lengths[
        rollout_number] <= self._max_episode_length
    traj = TrajectoryBatch(
        env_spec=self._envs[rollout_number].spec,
        observations=np.asarray(self._observations[rollout_number]),
        last_observations=np.asarray([last_observation]),
        actions=np.asarray(self._actions[rollout_number]),
        rewards=np.asarray(self._rewards[rollout_number]),
        step_types=np.asarray(self._step_types[rollout_number],
                              dtype=StepType),
        env_infos=self._env_infos[rollout_number],
        agent_infos=self._agent_infos[rollout_number],
        lengths=np.asarray([self._path_lengths[rollout_number]], dtype='l'))
    self._completed_rollouts.append(traj)
    self._observations[rollout_number] = []
    self._actions[rollout_number] = []
    self._rewards[rollout_number] = []
    self._step_types[rollout_number] = []
    self._path_lengths[rollout_number] = 0
    self._prev_obs[rollout_number] = self._envs[rollout_number].reset()

def slice_trajectories(trajectories, slice_size):
    """Slice trajectories into smaller trajectories.

    Args:
        trajectories (TrajectoryBatch): Trajectories to slice.
        slice_size (int): Maximum length of the sliced trajectories.

    Returns:
        list[TrajectoryBatch]: Sliced trajectories, each containing a
            single trajectory of at most slice_size time steps.

    """
    sliced = []
    for traj in trajectories.split():
        splits = math.ceil(traj.lengths[0] / slice_size)
        split_indices = np.array_split(np.arange(traj.lengths[0]), splits)
        next_obs = traj.next_observations
        for indices in split_indices:
            last_obs = np.asarray([next_obs[indices[-1]]])
            t = TrajectoryBatch(
                env_spec=traj.env_spec,
                observations=traj.observations[indices],
                last_observations=last_obs,
                actions=traj.actions[indices],
                rewards=traj.rewards[indices],
                step_types=traj.step_types[indices],
                env_infos={
                    k: v[indices]
                    for (k, v) in traj.env_infos.items()
                },
                agent_infos={
                    k: v[indices]
                    for (k, v) in traj.agent_infos.items()
                },
                lengths=np.asarray([len(indices)], dtype='l'))
            sliced.append(t)
    return sliced

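# How the index arithmetic in slice_trajectories() chops one trajectory: a
# length-7 trajectory with slice_size=3 yields ceil(7/3)=3 near-equal
# pieces, never longer than slice_size.
import math

import numpy as np

length, slice_size = 7, 3
splits = math.ceil(length / slice_size)
split_indices = np.array_split(np.arange(length), splits)
assert [list(ix) for ix in split_indices] == [[0, 1, 2], [3, 4], [5, 6]]
assert all(len(ix) <= slice_size for ix in split_indices)
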
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self._discount)
    samples_data = self.paths_to_tensors(paths)
    samples_data['average_return'] = np.mean(undiscounted_returns)
    logger.log('Optimizing policy...')
    self.optimize_policy(itr, samples_data)
    return samples_data['average_return']