def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False,
            deterministic=False):
    """Generate a sample from a policy.

    Args:
        deterministic (bool): Boolean variable indicating whether a
            stochastic or deterministic action should be taken during the
            rollout. This is False (stochastic actions) by default.

    Returns:
        dict: Dictionary with keys `observations`, `actions`, `rewards`,
            `agent_infos` and `env_infos`, or None if `animated` is True
            and `always_return_paths` is False.

    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        if deterministic:
            a = agent_info['mean']
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return None

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def test_stack_tensor_dict_list(self):
    results = stack_tensor_dict_list(self.data)
    assert results['obs'].shape == (2, 3)
    assert results['act'].shape == (2, 3)
    assert results['info']['lala'].shape == (2, 2)
    assert results['info']['baba'].shape == (2, 2)

    results = stack_tensor_dict_list(self.data2)
    assert results['obs'].shape == (2, 3)
    assert results['act'].shape == (2, 3)
    assert results['info']['lala'].shape == (2, 2)
    assert results['info']['baba'].shape == (2, )
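The fixtures self.data and self.data2 are built in the test's setup and are not shown here. As a rough, hypothetical sketch of the behaviour these assertions exercise (the fixture contents and the exact import path are assumptions; tensor_utils lives in different modules across rllab/garage versions), stacking a list of dicts of arrays yields one dict whose leaves gain a leading axis:

import numpy as np
from garage.misc import tensor_utils  # module path may differ by version

# Hypothetical fixture shaped like the assertions above.
data = [
    dict(obs=np.zeros(3), act=np.zeros(3),
         info=dict(lala=np.zeros(2), baba=np.zeros(2))),
    dict(obs=np.ones(3), act=np.ones(3),
         info=dict(lala=np.ones(2), baba=np.ones(2))),
]

stacked = tensor_utils.stack_tensor_dict_list(data)
assert stacked['obs'].shape == (2, 3)           # two entries stacked on axis 0
assert stacked['info']['lala'].shape == (2, 2)  # nested dicts are stacked too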
def step(self, action_n):
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = list(zip(*results))
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]

    items = list(zip(ids, obs, rewards, dones, env_infos))
    items = sorted(items, key=lambda x: x[0])

    ids, obs, rewards, dones, env_infos = list(zip(*items))

    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)

    self.ts += 1
    dones[self.ts >= self.max_path_length] = True

    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0

    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))
def step(self, action_n): """Step all environments using the provided actions. Inserts an environment infor 'vec_env_executor.complete' containing the episode end signal (time limit reached or done signal from environment). Args: action_n (np.ndarray): Array of actions. Returns: tuple: Tuple containing: * observations (np.ndarray) * rewards (np.ndarray) * dones (np.ndarray): The done signal from the environment. * env_infos (dict[str, np.ndarray]) """ all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)] obs, rewards, dones, env_infos = list( map(list, list(zip(*all_results)))) dones = np.asarray(dones) rewards = np.asarray(rewards) self.ts += 1 completes = np.asarray(dones) if self.max_path_length is not None: completes[self.ts >= self.max_path_length] = True for (i, complete) in enumerate(completes): if complete: obs[i] = self.envs[i].reset() self.ts[i] = 0 env_infos[i]['vec_env_executor.complete'] = completes return (obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos))
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        # a = agent_info["mean"]
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        # if d:
        #     break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return None

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def step(self, action_n):
    all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
    obs, rewards, dones, env_infos = list(
        map(list, list(zip(*all_results))))
    dones = np.asarray(dones)
    rewards = np.asarray(rewards)
    self.ts += 1
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = self.envs[i].reset()
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        env_infos)
def worker_run_step(g, action_n, scope):
    assert hasattr(g, 'parallel_vec_envs')
    assert scope in g.parallel_vec_envs
    env_template = g.parallel_vec_env_template[scope]
    ids = []
    step_results = []
    for (idx, env) in g.parallel_vec_envs[scope]:
        action = action_n[idx]
        ids.append(idx)
        step_results.append(tuple(env.step(action)))
    if not step_results:
        return None
    obs, rewards, dones, env_infos = list(map(list, list(zip(*step_results))))
    obs = env_template.observation_space.flatten_n(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    env_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return ids, obs, rewards, dones, env_infos
def step(self, action_n): """Step all environments using the provided actions.""" all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)] obs, rewards, dones, env_infos = list( map(list, list(zip(*all_results)))) dones = np.asarray(dones) rewards = np.asarray(rewards) self.ts += 1 if self.max_path_length is not None: dones[self.ts >= self.max_path_length] = True for (i, done) in enumerate(dones): if done: reset_new_obs = self.envs[i].reset( ) # Don't modify obs. Fixing bug from original VecEnvExecutor class. if "reset_new_obs" not in env_infos[i]: env_infos[i]["reset_new_obs"] = [] env_infos[i]["reset_new_obs"].append((i, reset_new_obs)) self.ts[i] = 0 return obs, rewards, dones, tensor_utils.stack_tensor_dict_list( env_infos)
def step(self, action_n, itr=None):
    """Step all environments using the provided actions."""
    all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
    # for e in self.envs:
    #     e.render()
    obs, rewards, dones, env_infos = list(
        map(list, list(zip(*all_results))))
    dones = np.asarray(dones)
    rewards = np.asarray(rewards)
    self.ts += 1
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            if itr is not None:
                obs[i] = self.envs[i].reset(epoch=itr)
            else:
                obs[i] = self.envs[i].reset()
            # obs[i] = self.envs[i].reset()
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        env_infos)
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Sample the policy for new trajectories. Args: itr (int): Iteration number. batch_size (int): Number of samples to be collected. If None, it will be default [algo.max_path_length * n_envs]. whole_paths (bool): Whether return all the paths or not. True by default. It's possible for the paths to have total actual sample size larger than batch_size, and will be truncated if this flag is true. Returns: list[dict]: Sample paths, each path with key * observations: (numpy.ndarray) * actions: (numpy.ndarray) * rewards: (numpy.ndarray) * agent_infos: (dict) * env_infos: (dict) """ logger.log('Obtaining samples for iteration %d...' % itr) if not batch_size: batch_size = self.algo.max_path_length * self._n_envs paths = [] n_samples = 0 obses = self._vec_env.reset() dones = np.asarray([True] * self._vec_env.num_envs) running_paths = [None] * self._vec_env.num_envs pbar = ProgBarCounter(batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy while n_samples < batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self._vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self._vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self._vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]['observations'].append(observation) running_paths[idx]['actions'].append(action) running_paths[idx]['rewards'].append(reward) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['agent_infos'].append(agent_info) if done: obs = np.asarray(running_paths[idx]['observations']) actions = np.asarray(running_paths[idx]['actions']) paths.append( dict(observations=obs, actions=actions, rewards=np.asarray(running_paths[idx]['rewards']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['agent_infos']))) n_samples += len(running_paths[idx]['rewards']) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() tabular.record('PolicyExecTime', policy_time) tabular.record('EnvExecTime', env_time) tabular.record('ProcessExecTime', process_time) return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, batch_size): """Collect samples for the given iteration number. Args: itr(int): Iteration number. batch_size(int): Number of environment interactions in one batch. Returns: list: A list of paths. """ paths = [] if not self.no_reset or self._last_obses is None: obses = self.vec_env.reset() else: obses = self._last_obses dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs n_samples = 0 policy = self.algo.policy if self.algo.es: self.algo.es.reset() while n_samples < batch_size: policy.reset(dones) if self.algo.input_include_goal: obs = [obs['observation'] for obs in obses] d_g = [obs['desired_goal'] for obs in obses] a_g = [obs['achieved_goal'] for obs in obses] input_obses = np.concatenate((obs, d_g), axis=-1) else: input_obses = obses if self.algo.es: actions, agent_infos = self.algo.es.get_actions( itr, input_obses, self.algo.policy) else: actions, agent_infos = self.algo.policy.get_actions( input_obses) next_obses, rewards, dones, env_infos = self.vec_env.step(actions) self._last_obses = next_obses agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) n_samples += len(next_obses) if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if self.algo.input_include_goal: self.algo.replay_buffer.add_transitions( observation=obs, action=actions, goal=d_g, achieved_goal=a_g, terminal=dones, next_observation=[ next_obs['observation'] for next_obs in next_obses ], next_achieved_goal=[ next_obs['achieved_goal'] for next_obs in next_obses ], ) else: self.algo.replay_buffer.add_transitions( observation=obses, action=actions, reward=rewards * self.algo.reward_scale, terminal=dones, next_observation=next_obses, ) for idx, reward, env_info, done in zip(itertools.count(), rewards, env_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( rewards=[], env_infos=[], dones=[], undiscounted_return=self._last_uncounted_discount[idx], # running_length: Length of path up to now # Note that running_length is not len(rewards) # Because a path may not be complete in one batch running_length=self._last_running_length[idx], success_count=self._last_success_count[idx]) running_paths[idx]['rewards'].append(reward) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['dones'].append(done) running_paths[idx]['running_length'] += 1 running_paths[idx]['undiscounted_return'] += reward running_paths[idx]['success_count'] += env_info.get( 'is_success') or 0 self._last_uncounted_discount[idx] += reward self._last_success_count[idx] += env_info.get( 'is_success') or 0 self._last_running_length[idx] += 1 if done or n_samples >= batch_size: paths.append( dict( rewards=tensor_utils.stack_tensor_list( running_paths[idx]['rewards']), dones=tensor_utils.stack_tensor_list( running_paths[idx]['dones']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), running_length=running_paths[idx] ['running_length'], undiscounted_return=running_paths[idx] ['undiscounted_return'], success_count=running_paths[idx]['success_count'])) running_paths[idx] = None if done: self._last_running_length[idx] = 0 self._last_success_count[idx] = 0 self._last_uncounted_discount[idx] = 0 if self.algo.es: self.algo.es.reset() obses = next_obses return paths
def rollout(env,
            agent,
            *,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            deterministic=False):
    """Sample a single rollout of the agent in the environment.

    Args:
        agent(Policy): Agent used to select actions.
        env(gym.Env): Environment to perform actions in.
        max_path_length(int): If the rollout reaches this many timesteps,
            it is terminated.
        animated(bool): If true, render the environment after each step.
        speedup(float): Factor by which to decrease the wait time between
            rendered steps. Only relevant, if animated == true.
        deterministic (bool): If true, use the mean action returned by the
            stochastic policy instead of sampling from the returned action
            distribution.

    Returns:
        dict[str, np.ndarray or dict]: Dictionary, with keys:
            * observations(np.array): Non-flattened array of observations.
                There should be one more of these than actions. Note that
                observations[i] (for i < len(observations) - 1) was used by
                the agent to choose actions[i]. Should have shape (T + 1,
                S^*) (the unflattened state space of the current
                environment).
            * actions(np.array): Non-flattened array of actions. Should
                have shape (T, S^*) (the unflattened action space of the
                current environment).
            * rewards(np.array): Array of rewards of shape (T,) (1D array
                of length timesteps).
            * agent_infos(Dict[str, np.array]): Dictionary of stacked,
                non-flattened `agent_info` arrays.
            * env_infos(Dict[str, np.array]): Dictionary of stacked,
                non-flattened `env_info` arrays.
            * dones(np.array): Array of termination signals.

    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < (max_path_length or np.inf):
        a, agent_info = agent.get_action(o)
        if deterministic and 'mean' in agent_info:
            a = agent_info['mean']
        next_o, r, d, env_info = env.step(a)
        observations.append(o)
        rewards.append(r)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        rewards=np.array(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.array(dones),
    )
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Sample the policy for new trajectories. If batch size is not specified, episode per task by default is 1 so batch size will be meta_batch_size * max_path_length. When number of workers are less than meta batch size, sampling will be performed for each of self._vec_envs_indices in series. The i-th value of self._vec_envs_indices represents the indices of the environments/tasks to be sampled for the i-th iteration. Args: itr (int): Iteration number. batch_size (int): Number of samples to be collected. If None, it will be default [algo.max_path_length * n_envs]. whole_paths (bool): Whether return all the paths or not. True by default. It's possible for the paths to have total actual sample size larger than batch_size, and will be truncated if this flag is true. Returns: OrderedDict: Sample paths. Key represents the index of the environment/task and value represents all the paths sampled from that particular environment/task. Note: Each path is a dictionary, with keys and values as following: * observations: numpy.ndarray with shape :math:`[N, S^*]` * actions: numpy.ndarray with shape :math:`[N, S^*]` * rewards: numpy.ndarray with shape :math:`[N, S^*]` * dones: numpy.ndarray with shape :math:`[N, S^*]` * env_infos: A dictionary with each key representing one environment info, value being a numpy.ndarray with shape :math:`[N, S^*]`. One example is "ale.lives" for atari environments. * agent_infos: A dictionary with each key representing one agent info, value being a numpy.ndarray with shape :math:`[N, S^*]`. One example is "prev_action", which is used for recurrent policy as previous action input, merged with the observation input as the state input. """ logger.log('Obtaining samples for iteration %d...' 
% itr) if batch_size is None: batch_size = self.algo.max_path_length * self._meta_batch_size paths = [] tasks = self.env.sample_tasks(self._meta_batch_size) # Start main loop batch_size_per_loop = batch_size // len(self._vec_envs_indices) for vec_envs_indices in self._vec_envs_indices: self._setup_worker(vec_envs_indices, tasks) n_samples = 0 obses = self._vec_env.reset() dones = np.asarray([True] * self._vec_env.num_envs) running_paths = [None] * self._vec_env.num_envs pbar = ProgBarCounter(batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy # Only reset policies at the beginning of a meta batch policy.reset(dones) while n_samples < batch_size_per_loop: t = time.time() actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self._vec_env.step( actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self._vec_env.num_envs)] if agent_infos is None: agent_infos = [ dict() for _ in range(self._vec_env.num_envs) ] for idx, observation, action, reward, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], dones=[], env_infos=[], agent_infos=[], ) running_paths[idx]['observations'].append(observation) running_paths[idx]['actions'].append(action) running_paths[idx]['rewards'].append(reward) running_paths[idx]['dones'].append(done) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['agent_infos'].append(agent_info) if done: obs = np.asarray(running_paths[idx]['observations']) actions = np.asarray(running_paths[idx]['actions']) paths.append( dict(observations=obs, actions=actions, rewards=np.asarray( running_paths[idx]['rewards']), dones=np.asarray(running_paths[idx]['dones']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), agent_infos=tensor_utils. stack_tensor_dict_list( running_paths[idx]['agent_infos']), batch_idx=idx)) n_samples += len(running_paths[idx]['rewards']) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() tabular.record('PolicyExecTime', policy_time) tabular.record('EnvExecTime', env_time) tabular.record('ProcessExecTime', process_time) return paths if whole_paths else truncate_paths(paths, batch_size)
def process_samples(self, itr, paths): """Return processed sample data based on the collected paths. Args: itr (int): Iteration number. paths (list[dict]): A list of collected paths Returns: dict: Processed sample data, with key * average_return: (float) """ baselines = [] returns = [] max_path_length = self.max_path_length if hasattr(self.baseline, 'predict_n'): all_path_baselines = self.baseline.predict_n(paths) else: all_path_baselines = [ self.baseline.predict(path) for path in paths ] for idx, path in enumerate(paths): # baselines path['baselines'] = all_path_baselines[idx] baselines.append(path['baselines']) # returns path['returns'] = special.discount_cumsum(path['rewards'], self.discount) returns.append(path['returns']) agent_infos = [path['agent_infos'] for path in paths] agent_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos ]) valids = [np.ones_like(path['returns']) for path in paths] valids = tensor_utils.pad_tensor_n(valids, max_path_length) average_discounted_return = (np.mean( [path['returns'][0] for path in paths])) undiscounted_returns = [sum(path['rewards']) for path in paths] self.episode_reward_mean.extend(undiscounted_returns) ent = np.sum(self.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids) samples_data = dict(average_return=np.mean(undiscounted_returns)) tabular.record('Iteration', itr) tabular.record('AverageDiscountedReturn', average_discounted_return) tabular.record('AverageReturn', np.mean(undiscounted_returns)) tabular.record('Extras/EpisodeRewardMean', np.mean(self.episode_reward_mean)) tabular.record('NumTrajs', len(paths)) tabular.record('Entropy', ent) tabular.record('Perplexity', np.exp(ent)) tabular.record('StdReturn', np.std(undiscounted_returns)) tabular.record('MaxReturn', np.max(undiscounted_returns)) tabular.record('MinReturn', np.min(undiscounted_returns)) return samples_data
def process_samples(self, itr, paths):
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, 'predict_n'):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path['rewards'] + \
            self.algo.discount * path_baselines[1:] - path_baselines[:-1]
        path['advantages'] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path['returns'])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path['observations'] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path['actions'] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path['rewards'] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path['returns'] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path['advantages'] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path['env_infos'] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path['agent_infos'] for path in paths])

        if self.algo.center_adv:
            advantages = utils.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])

        undiscounted_returns = [sum(path['rewards']) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path['advantages']) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path['advantages'] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path['advantages'] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path['advantages'] for path in paths]

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])

        undiscounted_returns = [sum(path['rewards']) for path in paths]

        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log('fitting baseline...')
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log('fitted')

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('ExplainedVariance', ev)
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data
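The advantage computation above follows standard GAE: one-step TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) are discount-cumsummed with gamma * lambda, and returns are the discounted cumsum of rewards. A minimal self-contained numpy sketch of that recurrence (re-implementing the cumsum rather than importing special.discount_cumsum; the reward and baseline values are made up):

import numpy as np

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * y[t + 1], computed back to front
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

rewards = np.array([1.0, 0.0, 2.0])
baselines = np.array([0.5, 0.4, 0.3])        # V(s_0), V(s_1), V(s_2)
gamma, lam = 0.99, 0.95

path_baselines = np.append(baselines, 0)     # V after the last step := 0
deltas = rewards + gamma * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum(deltas, gamma * lam)   # GAE(gamma, lambda)
returns = discount_cumsum(rewards, gamma)           # discounted returns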
def obtain_samples(self, itr, batch_size): """Collect samples for the given iteration number. Args: itr(int): Iteration number. batch_size(int): Number of environment interactions in one batch. Returns: list: A list of paths. """ paths = [] if not self.no_reset or self._last_obses is None: obses = self.vec_env.reset() else: obses = self._last_obses dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs n_samples = 0 policy = self.algo.policy if self.algo.es: self.algo.es.reset() while n_samples < batch_size: policy.reset(dones) if self.algo.input_include_goal: obs = [obs['observation'] for obs in obses] d_g = [obs['desired_goal'] for obs in obses] a_g = [obs['achieved_goal'] for obs in obses] input_obses = np.concatenate((obs, d_g), axis=-1) else: input_obses = obses obs_normalized = tensor_utils.normalize_pixel_batch( self.env_spec, input_obses) if self.algo.es: actions, agent_infos = self.algo.es.get_actions( itr, obs_normalized, self.algo.policy) else: actions, agent_infos = self.algo.policy.get_actions( obs_normalized) next_obses, rewards, dones, env_infos = self.vec_env.step(actions) new_episode_obs = None if "reset_new_obs" in env_infos: new_episode_obs = next_obses.copy() for i, reset_new_obs in env_infos["reset_new_obs"][0]: new_episode_obs[i] = reset_new_obs del env_infos["reset_new_obs"] #self.vec_env.envs[0].render() env_infos = tensor_utils.split_tensor_dict_list(env_infos) n_samples += len(next_obses) if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if self.algo.input_include_goal: self.algo.replay_buffer.add_transitions( observation=obs, action=actions, goal=d_g, achieved_goal=a_g, terminal=dones, next_observation=[ next_obs['observation'] for next_obs in next_obses ], next_achieved_goal=[ next_obs['achieved_goal'] for next_obs in next_obses ], ) else: payload = { "observation": obses, "action": actions, "reward": rewards * self.algo.reward_scale, "terminal": dones, "next_observation": next_obses } if env_infos and env_infos[0].get("ground_truth_state") is not None: payload["ground_truth_state"] = [env_info.get("ground_truth_state") for env_info in env_infos] self.algo.replay_buffer.add_transitions( **payload ) for idx, reward, env_info, q_val, done in zip(itertools.count(), rewards, env_infos, agent_infos["q_vals"], dones): if running_paths[idx] is None: running_paths[idx] = dict( rewards=[], env_infos=[], dones=[], q_vals=self._last_q_vals[idx].copy(), undiscounted_return=self._last_uncounted_discount[idx], # running_length: Length of path up to now # Note that running_length is not len(rewards) # Because a path may not be complete in one batch running_length=self._last_running_length[idx], success_count=self._last_success_count[idx]) running_paths[idx]['rewards'].append(reward) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['dones'].append(done) running_paths[idx]['q_vals'].append(q_val) running_paths[idx]['running_length'] += 1 running_paths[idx]['undiscounted_return'] += reward running_paths[idx]['success_count'] += env_info.get( 'is_success') or 0 self._last_q_vals[idx].append(q_val) self._last_uncounted_discount[idx] += reward self._last_success_count[idx] += env_info.get( 'is_success') or 0 self._last_running_length[idx] += 1 if done or n_samples >= batch_size: paths.append( dict( rewards=np.asarray(running_paths[idx]['rewards']), dones=np.asarray(running_paths[idx]['dones']), 
env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), q_vals=np.asarray(running_paths[idx]["q_vals"]), running_length=running_paths[idx] ['running_length'], undiscounted_return=running_paths[idx] ['undiscounted_return'], success_count=running_paths[idx]['success_count'])) running_paths[idx] = None if done: self._last_q_vals[idx] = [] self._last_running_length[idx] = 0 self._last_success_count[idx] = 0 self._last_uncounted_discount[idx] = 0 if self.algo.es: self.algo.es.reset() if new_episode_obs: obses = new_episode_obs else: obses = next_obses self._last_obses = obses return paths
def rollout(env,
            agent,
            *,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            deterministic=False):
    """Sample a single rollout of the agent in the environment.

    Args:
        agent(Policy): Agent used to select actions.
        env(gym.Env): Environment to perform actions in.
        max_path_length(int): If the rollout reaches this many timesteps,
            it is terminated.
        animated(bool): If true, render the environment after each step.
        speedup(float): Factor by which to decrease the wait time between
            rendered steps. Only relevant, if animated == true.
        deterministic (bool): If true, use the mean action returned by the
            stochastic policy instead of sampling from the returned action
            distribution.

    Returns:
        dict: Dictionary with keys:
            * observations(np.array): Non-flattened array of observations.
            * actions(np.array): Non-flattened array of actions.
            * rewards(np.array): Array of rewards of shape (timesteps, 1).
            * agent_infos(dict[str, np.array]): Dictionary of stacked,
              non-flattened `agent_info`s.
            * env_infos(dict[str, np.array]): Dictionary of stacked,
              non-flattened `env_info`s.

    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        if deterministic:
            a = agent_info['mean']
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def obtain_samples(self, itr): logger.log("Obtaining samples for iteration %d..." % itr) paths = [] n_samples = 0 obses = self.vec_env.reset() dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(self.algo.batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < self.algo.batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, next_observation, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, next_obses, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], next_observations=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["next_observations"].append( next_observation) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths.append( dict(observations=self.env_spec.observation_space. flatten_n(running_paths[idx]["observations"]), actions=self.env_spec.action_space.flatten_n( running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), next_observation=tensor_utils.stack_tensor_list( running_paths[idx]["next_observations"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["agent_infos"]))) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() logger.record_tabular("PolicyExecTime", policy_time) logger.record_tabular("EnvExecTime", env_time) logger.record_tabular("ProcessExecTime", process_time) return paths
def _rollout(self,
             env,
             agent,  # self.policy
             *,
             skill=-1,
             max_path_length=np.inf,
             animated=False,
             recorded=False,
             save_media_filename=None,
             speedup=1,
             deterministic=False):
    # TODO: sanity check if agent isinstance of skill algo
    if skill == -1:
        skill = self._sample_skill()
    skills = []
    states = []
    actions = []
    self_rewards = []
    env_rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    s = env.reset()
    z = np.eye(self.skills_num)[skill]
    agent.reset()
    path_length = 0
    video_buffer = []
    if recorded and self._is_gym_render is True:
        video_buffer.append(env.render(mode="rgb_array"))
    if animated and self._is_gym_render is True:
        env.render(mode="human")
    while path_length < (max_path_length or np.inf):
        s = env.observation_space.flatten(s)
        a, agent_info = agent.get_action(s, z)
        if deterministic and 'mean' in agent_info:
            a = agent_info['mean']
        next_s, env_r, d, env_info = env.step(a)
        self_r = self._obtain_pseudo_reward(s, skill)
        states.append(s)
        self_rewards.append(self_r)
        env_rewards.append(env_r)
        actions.append(a)
        skills.append(skill)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        s = next_s
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
        if recorded and self._is_gym_render is True:
            video_buffer.append(env.render(mode="rgb_array"))

    if recorded and self._is_gym_render is False:
        env.render(
            paths=[
                dict(
                    skills=skills,
                    states=states,
                    actions=actions,
                    self_rewards=self_rewards,  # squeeze column
                    env_rewards=env_rewards,
                    agent_infos=agent_infos,
                    env_infos=env_infos,
                    dones=dones)
            ],
            save_path=save_media_filename)
    if recorded and self._is_gym_render is True:
        fps = (1 / self._time_per_render)
        self._save_video(video_buffer, save_media_filename, fps)

    return dict(
        skills=skills,
        states=np.array(states),
        actions=np.array(actions),
        self_rewards=np.array(self_rewards)[:, 0],  # squeeze column
        env_rewards=np.array(env_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.array(dones),
    )
def rollout(env,
            agent,
            *,
            max_episode_length=np.inf,
            animated=False,
            speedup=1,
            deterministic=False):
    """Sample a single episode of the agent in the environment.

    Args:
        agent (Policy): Agent used to select actions.
        env (Environment): Environment to perform actions in.
        max_episode_length (int): If the episode reaches this many
            timesteps, it is truncated.
        animated (bool): If true, render the environment after each step.
        speedup (float): Factor by which to decrease the wait time between
            rendered steps. Only relevant, if animated == true.
        deterministic (bool): If true, use the mean action returned by the
            stochastic policy instead of sampling from the returned action
            distribution.

    Returns:
        dict[str, np.ndarray or dict]: Dictionary, with keys:
            * observations(np.array): Flattened array of observations.
                There should be one more of these than actions. Note that
                observations[i] (for i < len(observations) - 1) was used by
                the agent to choose actions[i]. Should have shape (T + 1,
                S^*) (the unflattened state space of the current
                environment).
            * actions(np.array): Non-flattened array of actions. Should
                have shape (T, S^*) (the unflattened action space of the
                current environment).
            * rewards(np.array): Array of rewards of shape (T,) (1D array
                of length timesteps).
            * agent_infos(Dict[str, np.array]): Dictionary of stacked,
                non-flattened `agent_info` arrays.
            * env_infos(Dict[str, np.array]): Dictionary of stacked,
                non-flattened `env_info` arrays.
            * dones(np.array): Array of termination signals.

    """
    del speedup
    env_steps = []
    agent_infos = []
    observations = []
    last_obs = env.reset()[0]
    agent.reset()
    episode_length = 0
    if animated:
        env.visualize()
    while episode_length < (max_episode_length or np.inf):
        a, agent_info = agent.get_action(last_obs)
        if deterministic and 'mean' in agent_info:
            a = agent_info['mean']
        es = env.step(a)
        env_steps.append(es)
        observations.append(last_obs)
        agent_infos.append(agent_info)
        episode_length += 1
        if es.terminal:
            break
        last_obs = es.observation

    return dict(
        observations=np.array(observations),
        actions=np.array([es.action for es in env_steps]),
        rewards=np.array([es.reward for es in env_steps]),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(
            [es.env_info for es in env_steps]),
        dones=np.array([es.terminal for es in env_steps]),
    )
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Sample the policy for new trajectories. Args: itr (int): Iteration number. batch_size (int): Number of samples to be collected. If None, it will be default [algo.max_path_length * n_envs]. whole_paths (bool): Whether return all the paths or not. True by default. It's possible for the paths to have total actual sample size larger than batch_size, and will be truncated if this flag is true. Returns: list[dict]: Sample paths. Note: Each path is a dictionary, with keys and values as following: * observations: numpy.ndarray with shape [Batch, *obs_dims] * actions: numpy.ndarray with shape [Batch, *act_dims] * rewards: numpy.ndarray with shape [Batch, ] * env_infos: A dictionary with each key representing one environment info, value being a numpy.ndarray with shape [Batch, ?]. One example is "ale.lives" for atari environments. * agent_infos: A dictionary with each key representing one agent info, value being a numpy.ndarray with shape [Batch, ?]. One example is "prev_action", which is used for recurrent policy as previous action input, merged with the observation input as the state input. * dones: numpy.ndarray with shape [Batch, ] """ logger.log('Obtaining samples for iteration %d...' % itr) if not batch_size: batch_size = self.algo.max_path_length * self._n_envs paths = [] n_samples = 0 obses = self._vec_env.reset() dones = np.asarray([True] * self._vec_env.num_envs) running_paths = [None] * self._vec_env.num_envs policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy with click.progressbar(length=batch_size, label='Sampling') as pbar: while n_samples < batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = \ self._vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self._vec_env.num_envs)] if agent_infos is None: agent_infos = [ dict() for _ in range(self._vec_env.num_envs) ] for idx, observation, action, reward, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict(observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], dones=[]) running_paths[idx]['observations'].append(observation) running_paths[idx]['actions'].append(action) running_paths[idx]['rewards'].append(reward) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['agent_infos'].append(agent_info) running_paths[idx]['dones'].append(done) if done: obs = np.asarray(running_paths[idx]['observations']) actions = np.asarray(running_paths[idx]['actions']) paths.append( dict(observations=obs, actions=actions, rewards=np.asarray( running_paths[idx]['rewards']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), agent_infos=tensor_utils. 
stack_tensor_dict_list( running_paths[idx]['agent_infos']), dones=np.asarray( running_paths[idx]['dones']))) n_samples += len(running_paths[idx]['rewards']) running_paths[idx] = None process_time += time.time() - t pbar.update(len(obses)) obses = next_obses tabular.record('PolicyExecTime', policy_time) tabular.record('EnvExecTime', env_time) tabular.record('ProcessExecTime', process_time) return paths if whole_paths else truncate_paths(paths, batch_size)
def process_samples(self, itr, paths): """Return processed sample data based on the collected paths. Args: itr (int): Iteration number. paths (list[dict]): A list of collected paths Returns: dict: Processed sample data, with key * average_return: (float) """ baselines = [] returns = [] max_path_length = self.max_path_length if hasattr(self.baseline, 'predict_n'): all_path_baselines = self.baseline.predict_n(paths) else: all_path_baselines = [ self.baseline.predict(path) for path in paths ] for idx, path in enumerate(paths): # baselines path['baselines'] = all_path_baselines[idx] baselines.append(path['baselines']) # returns path['returns'] = tensor_utils.discount_cumsum( path['rewards'], self.discount) returns.append(path['returns']) obs = [path['observations'] for path in paths] obs = tensor_utils.pad_tensor_n(obs, max_path_length) actions = [path['actions'] for path in paths] actions = tensor_utils.pad_tensor_n(actions, max_path_length) rewards = [path['rewards'] for path in paths] rewards = tensor_utils.pad_tensor_n(rewards, max_path_length) agent_infos = [path['agent_infos'] for path in paths] agent_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos ]) env_infos = [path['env_infos'] for path in paths] env_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos ]) terminals = [path['dones'] for path in paths] valids = [np.ones_like(path['returns']) for path in paths] valids = tensor_utils.pad_tensor_n(valids, max_path_length) lengths = np.asarray([v.sum() for v in valids]) ent = np.sum(self.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids) undiscounted_returns = self.evaluate_performance( itr, dict(env_spec=None, observations=obs, actions=actions, rewards=rewards, terminals=terminals, env_infos=env_infos, agent_infos=agent_infos, lengths=lengths, discount=self.discount)) self.episode_reward_mean.extend(undiscounted_returns) tabular.record('Entropy', ent) tabular.record('Perplexity', np.exp(ent)) tabular.record('Extras/EpisodeRewardMean', np.mean(self.episode_reward_mean)) samples_data = dict(average_return=np.mean(undiscounted_returns)) return samples_data
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Obtain samples.""" logger.log('Obtaining samples for iteration %d...' % itr) if not batch_size: batch_size = self.algo.max_path_length * self.n_envs paths = [] n_samples = 0 obses = self.vec_env.reset() dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]['observations'].append(observation) running_paths[idx]['actions'].append(action) running_paths[idx]['rewards'].append(reward) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['agent_infos'].append(agent_info) if done: paths.append( dict(observations=self.env_spec.observation_space. flatten_n(running_paths[idx]['observations']), actions=self.env_spec.action_space.flatten_n( running_paths[idx]['actions']), rewards=tensor_utils.stack_tensor_list( running_paths[idx]['rewards']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['agent_infos']))) n_samples += len(running_paths[idx]['rewards']) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() tabular.record('PolicyExecTime', policy_time) tabular.record('EnvExecTime', env_time) tabular.record('ProcessExecTime', process_time) if whole_paths: return paths else: paths_truncated = truncate_paths(paths, batch_size) return paths_truncated
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Collect samples for the given iteration number. Args: itr(int): Iteration number. batch_size(int): Number of environment interactions in one batch. whole_paths(bool): Not effective. Only keep here to comply with base class. Raises: ValueError: If the algorithm doesn't have an exploration_policy field. Returns: list: A list of paths. """ assert batch_size is not None paths = [] if not self._no_reset or self._last_obses is None: obses = self._vec_env.reset() else: obses = self._last_obses completes = np.asarray([True] * self._vec_env.num_envs) running_paths = [None] * self._vec_env.num_envs n_samples = 0 policy = self.algo.exploration_policy if policy is None: raise ValueError('OffPolicyVectoriizedSampler should only be used ' 'with an exploration_policy.') while n_samples < batch_size: policy.reset(completes) obs_space = self.algo.env_spec.observation_space input_obses = obs_space.flatten_n(obses) actions, agent_infos = policy.get_actions(input_obses) next_obses, rewards, dones, env_infos = \ self._vec_env.step(actions) completes = env_infos['vec_env_executor.complete'] self._last_obses = next_obses agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) n_samples += len(next_obses) if agent_infos is None: agent_infos = [dict() for _ in range(self._vec_env.num_envs)] if env_infos is None: env_infos = [dict() for _ in range(self._vec_env.num_envs)] for idx, reward, env_info, done, obs, next_obs, action in zip( itertools.count(), rewards, env_infos, dones, obses, next_obses, actions): if running_paths[idx] is None: running_paths[idx] = dict( rewards=[], observations=[], next_observations=[], actions=[], env_infos=[], dones=[], undiscounted_return=self._last_uncounted_discount[idx], # running_length: Length of path up to now # Note that running_length is not len(rewards) # Because a path may not be complete in one batch running_length=self._last_running_length[idx], success_count=self._last_success_count[idx]) running_paths[idx]['rewards'].append(reward) running_paths[idx]['observations'].append(obs) running_paths[idx]['next_observations'].append(next_obs) running_paths[idx]['actions'].append(action) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['dones'].append(done) running_paths[idx]['running_length'] += 1 running_paths[idx]['undiscounted_return'] += reward running_paths[idx]['success_count'] += env_info.get( 'is_success') or 0 self._last_uncounted_discount[idx] += reward self._last_success_count[idx] += env_info.get( 'is_success') or 0 self._last_running_length[idx] += 1 if done or n_samples >= batch_size: paths.append( dict( rewards=np.asarray(running_paths[idx]['rewards']), dones=np.asarray(running_paths[idx]['dones']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), running_length=running_paths[idx] ['running_length'], undiscounted_return=running_paths[idx] ['undiscounted_return'], success_count=running_paths[idx]['success_count'])) act_space = self._env_spec.action_space path_dict = {} path_dict['observations'] = obs_space.flatten_n( running_paths[idx]['observations']) path_dict['next_observations'] = obs_space.flatten_n( running_paths[idx]['next_observations']) path_dict['rewards'] = np.asarray( running_paths[idx]['rewards']).reshape(-1, 1) path_dict['terminals'] = np.asarray( running_paths[idx]['dones']).reshape(-1, 1) path_dict['actions'] = act_space.flatten_n( running_paths[idx]['actions']) 
self.algo.replay_buffer.add_path(path_dict) running_paths[idx] = None if done: self._last_running_length[idx] = 0 self._last_success_count[idx] = 0 self._last_uncounted_discount[idx] = 0 obses = next_obses return paths
def obtain_samples(self, itr): """ Collect samples for the given iteration number. :param itr: Iteration number. :return: A list of paths. """ paths = [] obses = self.vec_env.reset() dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs n_samples = 0 batch_samples = self.vec_env.num_envs * self.algo.max_path_length policy = self.algo.policy if self.algo.es: self.algo.es.reset() while n_samples < batch_samples: policy.reset(dones) if self.algo.input_include_goal: obs = [obs["observation"] for obs in obses] d_g = [obs["desired_goal"] for obs in obses] a_g = [obs["achieved_goal"] for obs in obses] input_obses = np.concatenate((obs, d_g), axis=-1) else: input_obses = obses if self.algo.es: actions, agent_infos = self.algo.es.get_actions( input_obses, self.algo.policy) else: actions, agent_infos = self.algo.policy.get_actions( input_obses) next_obses, rewards, dones, env_infos = self.vec_env.step(actions) agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if self.algo.input_include_goal: self.algo.replay_buffer.add_transition( observation=obs, action=actions, goal=d_g, achieved_goal=a_g, terminal=dones, next_observation=[ next_obs["observation"] for next_obs in next_obses ], next_achieved_goal=[ next_obs["achieved_goal"] for next_obs in next_obses ], ) else: self.algo.replay_buffer.add_transition( observation=obses, action=actions, reward=rewards * self.algo.reward_scale, terminal=dones, next_observation=next_obses, ) for idx, reward, env_info, done in zip(itertools.count(), rewards, env_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( rewards=[], env_infos=[], ) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) if done: paths.append( dict(rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]))) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None if self.algo.es: self.algo.es.reset() obses = next_obses return paths
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    baselines = []
    returns = []

    max_path_length = self.max_path_length

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)
    self.episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(average_return=np.mean(undiscounted_returns))

    return samples_data
def traj_list_to_tensors(paths, max_path_length, baseline_predictions,
                         discount):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of GAE
            (Generalized Advantage Estimation) Baseline.
        discount (float): Environment reward discount.

    Returns:
        dict: Processed sample data, with key
            * observations (numpy.ndarray): Padded array of the
                observations of the environment
            * actions (numpy.ndarray): Padded array of the actions fed to
                the environment
            * rewards (numpy.ndarray): Padded array of the acquired rewards
            * agent_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * env_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * valids (numpy.ndarray): Padded array of the validity
                information

    """
    baselines = []
    returns = []

    for idx, path in enumerate(paths):
        # baselines
        path["baselines"] = baseline_predictions[idx]
        baselines.append(path["baselines"])

        # returns
        path["returns"] = tensor_utils.discount_cumsum(
            path["rewards"], discount)
        returns.append(path["returns"])

    obs = [path["observations"] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path["actions"] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path["rewards"] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path["agent_infos"] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path["env_infos"] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path["returns"]) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        agent_infos=agent_infos,
        env_infos=env_infos,
        valids=valids,
    )

    return samples_data
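A plain-numpy sketch of the padding and valids bookkeeping used above (a standalone illustration, not the library's pad_tensor_n; the path lengths are made up): paths shorter than max_path_length are zero-padded, and valids marks which padded entries correspond to real timesteps so later averages can ignore the padding.

import numpy as np

def pad_to(x, length):
    # Zero-pad a per-path array up to a fixed length along axis 0.
    padded = np.zeros((length, ) + np.asarray(x).shape[1:])
    padded[:len(x)] = x
    return padded

max_path_length = 5
rewards_per_path = [np.array([1., 1., 1.]), np.array([2., 2.])]

rewards = np.stack([pad_to(r, max_path_length) for r in rewards_per_path])
valids = np.stack(
    [pad_to(np.ones_like(r), max_path_length) for r in rewards_per_path])

assert rewards.shape == (2, 5) and valids.shape == (2, 5)
# np.sum(x * valids) / np.sum(valids) then averages only over real timesteps.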
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Collect samples for the given iteration number.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of environment interactions in one batch.
        whole_paths (bool): Has no effect; kept only to comply with the
            base class.

    Returns:
        list: A list of paths.

    """
    assert batch_size is not None

    paths = []
    if not self._no_reset or self._last_obses is None:
        obses = self._vec_env.reset()
    else:
        obses = self._last_obses
    completes = np.asarray([True] * self._vec_env.num_envs)
    running_paths = [None] * self._vec_env.num_envs
    n_samples = 0

    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(completes)
        obs_space = self.algo.env_spec.observation_space
        input_obses = obs_space.flatten_n(obses)
        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                itr, input_obses, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                input_obses)

        next_obses, rewards, dones, env_infos = \
            self._vec_env.step(actions)
        completes = env_infos['vec_env_executor.complete']
        self._last_obses = next_obses
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]

        self.algo.replay_buffer.add_transitions(
            observation=obses,
            action=actions,
            reward=rewards,
            terminal=dones,
            next_observation=next_obses,
        )

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: length of the path up to now.
                    # Note that running_length is not len(rewards),
                    # because a path may not be completed in one batch.
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=np.asarray(running_paths[idx]['rewards']),
                        dones=np.asarray(running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if self.algo.es:
                    self.algo.es.reset()
        obses = next_obses

    return paths
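# obtain_samples converts the vectorized env_infos (a dict of per-env arrays)
# into one dict per environment via tensor_utils.split_tensor_dict_list. A
# simplified sketch of that assumed behavior (ignoring nested dicts) is:
import numpy as np


def _split_tensor_dict_list_sketch(tensor_dict):
    """Hypothetical, flat-dict-only stand-in for split_tensor_dict_list."""
    n = len(next(iter(tensor_dict.values())))
    return [{k: v[i] for k, v in tensor_dict.items()} for i in range(n)]


# _split_tensor_dict_list_sketch(
#     {'vec_env_executor.complete': np.array([False, True])})
# -> [{'vec_env_executor.complete': False},
#     {'vec_env_executor.complete': True}]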
def skill_rollout(env,
                  agent,
                  max_path_length=np.inf,
                  skill_stopping_func=None,
                  reset_start_rollout=True,
                  keep_rendered_rgbs=False,
                  animated=False,
                  speedup=1):
    """
    Perform one rollout in the given environment.

    Code adapted from https://github.com/florensacc/snn4hrl

    :param env: AsaEnv environment to run in
    :param agent: Policy to sample actions from
    :param max_path_length: force terminate the rollout after this many steps
    :param skill_stopping_func: function ({actions, observations} -> bool)
           that indicates that skill execution is done
    :param reset_start_rollout: whether to reset the env when calling this
           function
    :param keep_rendered_rgbs: whether to keep a list of all rgb_arrays (for
           future video making)
    :param animated: whether to render env after each step
    :param speedup: speedup factor for animation
    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    terminated = []
    rendered_rgbs = []
    if reset_start_rollout:
        o = env.reset()
    else:
        o = AsaEnv.get_current_obs_wrapped(env)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    if keep_rendered_rgbs:
        # will add a new entry to the path dict with all rendered images
        rendered_rgbs.append(env.render(mode='rgb_array'))
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        # natural termination
        if d:
            terminated.append(1)
            break
        terminated.append(0)
        # skill decides to terminate: let skill_stopping_func inspect the
        # partial path collected so far
        path_dict = dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            # here it concatenates all lower-level paths!
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        )
        if skill_stopping_func and skill_stopping_func(path_dict):
            break
        o = next_o
        if keep_rendered_rgbs:
            # will add a new entry to the path dict with all rendered images
            rendered_rgbs.append(env.render(mode='rgb_array'))
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    # This is off: in the case of an inner rollout, it would close the outer
    # renderer!
    # if animated:
    #     env.render(close=True)

    path_dict = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        # here it concatenates all lower-level paths!
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        # 'terminated' distinguishes a genuine done signal from simply
        # reaching the step limit; this matters when both happen at the same
        # time (important for hierarchized envs).
        terminated=tensor_utils.stack_tensor_list(terminated),
    )
    if keep_rendered_rgbs:
        path_dict['rendered_rgbs'] = tensor_utils.stack_tensor_list(
            rendered_rgbs)
    return path_dict
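# skill_rollout hands the partially built path dict to skill_stopping_func
# after every step. The helper below is a hypothetical example of such a
# predicate (the threshold and the observation layout are made up for
# illustration); it stops the skill once the first observation coordinate
# exceeds a threshold.
def make_stop_when_x_exceeds(threshold=1.0):
    def skill_stopping_func(path):
        last_obs = path['observations'][-1]
        return last_obs[0] > threshold
    return skill_stopping_func


# path = skill_rollout(env, skill_policy, max_path_length=50,
#                      skill_stopping_func=make_stop_when_x_exceeds(0.5))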
def obtain_samples_for_evaluation(self, num_paths=20):
    """Collect rollouts used for evaluation.

    Args:
        num_paths (int): Number of evaluation rollouts to collect.

    Returns:
        list: A list of paths.

    """
    paths = []
    policy = self.algo.policy

    for i in range(num_paths):
        obses = self.evaluate_env.reset()
        dones = np.asarray([True] * self.evaluate_env.num_envs)
        running_paths = [None] * self.evaluate_env.num_envs
        policy.reset(dones)
        end_of_path = False

        # Each evaluation rollout is capped at 500 steps.
        for j in range(500):
            input_obses = obses
            obs_normalized = tensor_utils.normalize_pixel_batch(
                self.env_spec, input_obses)
            obses = obs_normalized
            actions = self.algo.policy.get_actions(obs_normalized)
            # get_actions may return an (actions, agent_infos) tuple;
            # keep only the actions
            if len(actions) > 1:
                actions = actions[0]
            agent_infos = None

            next_obses, rewards, dones, env_infos = self.evaluate_env.step(
                actions)
            original_next_obses = next_obses
            next_obses = tensor_utils.normalize_pixel_batch(
                self.env_spec, next_obses)

            env_infos = tensor_utils.split_tensor_dict_list(env_infos)

            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self.evaluate_env.num_envs)
                ]
            if env_infos is None:
                env_infos = [
                    dict() for _ in range(self.evaluate_env.num_envs)
                ]
            for idx, reward, env_info, done in zip(itertools.count(),
                                                   rewards, env_infos,
                                                   dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=0,
                        # running_length: length of the path up to now.
                        # Note that running_length is not len(rewards),
                        # because a path may not be completed in one batch.
                        running_length=0,
                        success_count=0)

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                if done or j == 499:
                    paths.append(
                        dict(rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]['rewards']),
                             dones=tensor_utils.stack_tensor_list(
                                 running_paths[idx]['dones']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             running_length=running_paths[idx]
                             ['running_length'],
                             undiscounted_return=running_paths[idx]
                             ['undiscounted_return'],
                             success_count=running_paths[idx]
                             ['success_count']))
                    running_paths[idx] = None
                    end_of_path = True
            if end_of_path:
                break
            obses = original_next_obses

    return paths
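# obtain_samples_for_evaluation normalizes image observations with
# tensor_utils.normalize_pixel_batch before querying the policy. The sketch
# below shows the assumed rescaling of uint8 pixels in [0, 255] to floats in
# [0, 1]; it is illustrative only and omits the env_spec handling of the
# real helper.
import numpy as np


def _normalize_pixel_batch_sketch(observations):
    """Hypothetical stand-in for tensor_utils.normalize_pixel_batch."""
    return [np.asarray(obs, dtype=np.float32) / 255.0 for obs in observations]


# _normalize_pixel_batch_sketch([np.full((64, 64, 3), 255, dtype=np.uint8)])
# -> list with one array whose values are all 1.0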