def filter_episodes(traj):
  """Map TFRecord windows (of adjacent TimeSteps) to single episodes.

  Outputs the last episode within a sample window. It does this by using the
  step_type tensor to break up sequences into single-episode sections. For
  example, if step_type is [FIRST, MID, LAST, FIRST, MID, MID], we return a
  sample whose tensor indices are [3, 3, 3, 3, 4, 5], so that frame 3 is
  replicated across the beginning of the tensor.

  Args:
    traj: Trajectory.

  Returns:
    Trajectory containing a filtered sample with only one episode.
  """
  step_types = traj.step_type
  seq_len = tf.cast(tf.shape(step_types)[0], tf.int32)

  # Find the last start frame in the window. E.g. if we have step types
  # [FIRST, MID, LAST, FIRST, MID, MID], we want index 3.
  first_frames = tf.where(step_types == StepType.FIRST)

  if tf.shape(first_frames)[0] == 0:
    # No first frame, return sequence as is.
    inds = tf.range(0, seq_len)
  else:
    ind_start = tf.cast(first_frames[-1, 0], tf.int32)
    if ind_start == 0:
      # Last episode starts on the first frame, return as is.
      inds = tf.range(0, seq_len)
    else:
      # Otherwise, resample so that the last episode's first frame is
      # replicated to the beginning of the sample. In the example above we
      # want: [3, 3, 3, 3, 4, 5].
      inds_start = tf.tile(ind_start[None], ind_start[None])
      inds_end = tf.range(ind_start, seq_len)
      inds = tf.concat([inds_start, inds_end], axis=0)

  def _resample(arr):
    if isinstance(arr, tf.Tensor):
      return tf.gather(arr, inds)
    else:
      return arr  # empty or None

  observation = tf.nest.map_structure(_resample, traj.observation)
  return Trajectory(
      step_type=_resample(traj.step_type),
      action=_resample(traj.action),
      policy_info=_resample(traj.policy_info),
      next_step_type=_resample(traj.next_step_type),
      reward=_resample(traj.reward),
      discount=_resample(traj.discount),
      observation=observation)
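# A minimal, hedged usage sketch (not from the original code): it assumes TF
# eager execution and that StepType/Trajectory are the tf_agents types used
# above. It reproduces the docstring example, where the window spans two
# episodes and only the last one is kept, with frame 3 repeated at the start.
def example_filter_episodes():
  step_type = tf.constant([StepType.FIRST, StepType.MID, StepType.LAST,
                           StepType.FIRST, StepType.MID, StepType.MID])
  traj = Trajectory(step_type=step_type,
                    observation=tf.range(6, dtype=tf.float32),
                    action=tf.zeros(6, dtype=tf.int32),
                    policy_info=(),
                    next_step_type=tf.roll(step_type, shift=-1, axis=0),
                    reward=tf.zeros(6, dtype=tf.float32),
                    discount=tf.ones(6, dtype=tf.float32))
  filtered = filter_episodes(traj)
  print(filtered.observation.numpy())  # expected: [3. 3. 3. 3. 4. 5.]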
def make_trajectory_from(transitions: Sequence[Transition]) -> Trajectory:
    """Stacks (s, a, s_next) transitions into a two-column MID/MID Trajectory."""
    s, a, s_next = zip(*transitions)
    two_cols = np.ones((len(s), 2))
    return Trajectory(
        step_type=StepType.MID * two_cols,
        observation=np.stack((s, s_next), axis=1),
        action=np.stack((a, np.nan * np.ones_like(a)), axis=1),
        policy_info=(),
        next_step_type=StepType.MID * two_cols,
        reward=np.nan * two_cols,
        discount=0.99 * two_cols)
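# A hedged usage sketch: make_trajectory_from only relies on each Transition
# unpacking into (state, action, next_state), so plain tuples stand in for the
# Transition type here.
def example_make_trajectory_from():
    transitions = [
        (np.array([0.0, 1.0]), 0, np.array([0.1, 1.1])),
        (np.array([0.1, 1.1]), 1, np.array([0.2, 1.2])),
    ]
    traj = make_trajectory_from(transitions)
    print(traj.observation.shape)  # (2, 2, 2): [transition, (s, s_next), obs_dim]
    print(traj.action)             # second column is NaN padding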
def generate_experience_data(self, steps, save_dir):
    """Collects `steps` of policy experience and pickles it to `save_dir`."""
    time_step = self._tf_env.reset()
    observations = []
    actions = []
    labels = []
    for _ in tqdm(range(steps), 'Generating experience data'):
        action = self._agent.policy.action(time_step).action
        time_step = self._tf_env.step(action=action)
        label = {}
        if isinstance(self.env._env, DoomEnvironment):
            state = self._tf_env.envs[0]._game.get_state()
            self._tf_env.envs[0]._game.advance_action()
            if state is not None:
                demons = [
                    lbl for lbl in state.labels if lbl.object_name == 'Demon'
                ]
                if len(demons) > 0:
                    label['object_angle'] = int(demons[0].object_angle)
                    label['distance_from_wall'] = abs(
                        demons[0].object_position_x)
        observations.append(time_step.observation)
        actions.append(action.numpy()[0])
        labels.append(label)

    observations = np.array([ob.numpy()[0] for ob in observations])
    actions = np.array(actions)
    labels = np.array(labels)
    exp_data = Trajectory(observation=observations,
                          action=actions,
                          policy_info={'satisfaction': labels},
                          step_type=(),
                          next_step_type=(),
                          reward=(),
                          discount=())
    file_path = os.path.join(save_dir, f'exp_data_{steps}.pkl')
    with file_io.FileIO(file_path, mode='wb') as f:
        pickle.dump([exp_data], f, protocol=4)
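# A hedged helper sketch (not part of the original class) showing how the
# pickled experience written above could be read back: the file holds a
# one-element list containing a partially filled Trajectory (only observation,
# action and policy_info are populated).
def load_experience_data(save_dir, steps):
    file_path = os.path.join(save_dir, f'exp_data_{steps}.pkl')
    with file_io.FileIO(file_path, mode='rb') as f:
        [exp_data] = pickle.load(f)
    return exp_data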
def dummy_trajectory_batch(batch_size=2, n_steps=5, obs_dim=2):
    observations = tf.reshape(
        tf.constant(np.arange(batch_size * n_steps * obs_dim), dtype=tf.float32),
        (batch_size, n_steps, obs_dim),
    )
    time_steps = TimeStep(
        step_type=tf.constant([[1] * (n_steps - 2) + [2] * 2] * batch_size,
                              dtype=tf.int32),
        reward=tf.constant([[1] * n_steps] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1.0] * n_steps] * batch_size, dtype=tf.float32),
        observation=observations,
    )
    actions = tf.ones((batch_size, n_steps, 1), dtype=tf.float32)
    action_distribution_parameters = {
        "dist_params": {
            "loc": tf.constant([[[10.0]] * n_steps] * batch_size,
                               dtype=tf.float32),
            "scale": tf.constant([[[10.0]] * n_steps] * batch_size,
                                 dtype=tf.float32),
        },
        "value_prediction": tf.constant([[0.0] * n_steps] * batch_size,
                                        dtype=tf.float32),
    }
    policy_info = action_distribution_parameters

    return Trajectory(
        time_steps.step_type,
        observations,
        actions,
        policy_info,
        time_steps.step_type,
        time_steps.reward,
        time_steps.discount,
    )
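# A small sketch that inspects the dummy batch: each of the two sequences is
# three MID (=1) steps followed by two LAST (=2) steps, matching the step_type
# constant above.
def example_dummy_trajectory_batch():
    traj = dummy_trajectory_batch(batch_size=2, n_steps=5, obs_dim=2)
    print(traj.step_type.numpy())                        # [[1 1 1 2 2] [1 1 1 2 2]]
    print(traj.observation.shape)                        # (2, 5, 2)
    print(traj.policy_info["dist_params"]["loc"].shape)  # (2, 5, 1)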
def random_collect_step(environment, policies, replay_buffers):
    aggregate_time_step = environment.current_time_step()
    # On the first step the env reward is [0], while later steps give [[r1, r2, r3, r4]].
    is_first_step = aggregate_time_step.reward.shape == (1,)
    aggregate_action_step = {}  # action in the form [[e1, e2, e3, e4]]
    squeezed_action_step = {}  # action in the form [e1, e2, e3, e4]

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            # Create a time_step that satisfies the spec of the single random policy.
            time_step = TimeStep(aggregate_time_step.step_type[0],
                                 aggregate_time_step.reward[0],
                                 aggregate_time_step.discount[0],
                                 aggregate_time_step.observation[name][0])
            # The random policy receives the observation (time_step) and returns
            # an action (action_step).
            action_step = policies[i].action(time_step)
            squeezed_action_step[name] = action_step
            # Create an action_step (policy_step) that satisfies the spec of the
            # aggregate environment.
            action = tf.convert_to_tensor([action_step.action], dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step
    else:
        for i, name in enumerate(AGENT_NAMES):
            # Create a time_step that satisfies the spec of the single random policy.
            time_step = TimeStep(aggregate_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0],
                                 aggregate_time_step.observation[name][0])
            # The random policy receives the observation (time_step) and returns
            # an action (action_step).
            action_step = policies[i].action(time_step)
            squeezed_action_step[name] = action_step
            # Create an action_step (policy_step) that satisfies the spec of the
            # aggregate environment.
            action = tf.convert_to_tensor([action_step.action], dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step

    # Let the environment take one step forward.
    aggregate_next_time_step = environment.step(aggregate_action_step)

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            tra = Trajectory(aggregate_time_step.step_type[0], observation,
                             action_step.action, action_step.info,
                             aggregate_next_time_step.step_type[0][0],
                             aggregate_time_step.reward[0],
                             aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)
    else:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            if aggregate_next_time_step.step_type.shape == (1,):
                tra = Trajectory(aggregate_time_step.step_type[0][0], observation,
                                 action_step.action, action_step.info,
                                 aggregate_next_time_step.step_type[0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            else:
                tra = Trajectory(aggregate_time_step.step_type[0][0], observation,
                                 action_step.action, action_step.info,
                                 aggregate_next_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)
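# A hedged sketch of how random_collect_step might be driven to warm up the
# per-agent replay buffers before training. `environment`, `random_policies`
# and `replay_buffers` are assumed to be built elsewhere (one random policy and
# one buffer per name in AGENT_NAMES), and the step count is an assumption.
def warmup_replay_buffers(environment, random_policies, replay_buffers,
                          initial_collect_steps=1000):
    environment.reset()
    for _ in range(initial_collect_steps):
        random_collect_step(environment, random_policies, replay_buffers)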
def collect_step(environment, policies, replay_buffers):
    aggregate_time_step = environment.current_time_step()
    # On the first step the env reward is [0], while later steps give [[r1, r2, r3, r4]].
    is_first_step = aggregate_time_step.reward.shape == (1,)
    aggregate_action_step = {}  # action in the form [[e1, e2, e3, e4]]

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            # Extract the observation and construct a time step for each agent
            # separately. For the first step, the env outputs a step type of
            # shape [num].
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            time_step = TimeStep(aggregate_time_step.step_type[0],
                                 aggregate_time_step.reward[0],
                                 aggregate_time_step.discount[0], observation)
            # The agent policy receives the time_step and outputs a single action.
            action_step = policies[i].action(time_step)
            # Add the single action to the joint action.
            action = tf.convert_to_tensor(action_step.action, dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step
    else:
        for i, name in enumerate(AGENT_NAMES):
            # Extract the observation and construct a time step for each agent
            # separately. For steps other than the first, the env outputs a step
            # type of shape [[num, num, num, num]].
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            time_step = TimeStep(aggregate_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0], observation)
            # The agent policy receives the time_step and outputs a single action.
            action_step = policies[i].action(time_step)
            # Add the single action to the joint action.
            action = tf.convert_to_tensor(action_step.action, dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step

    # Let the environment take one step forward.
    aggregate_next_time_step = environment.step(aggregate_action_step)

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            tra = Trajectory(aggregate_time_step.step_type[0], observation,
                             action_step.action, action_step.info,
                             aggregate_next_time_step.step_type[0][0],
                             aggregate_time_step.reward[0],
                             aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)
    else:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            if aggregate_next_time_step.step_type.shape == (1,):
                tra = Trajectory(aggregate_time_step.step_type[0][0], observation,
                                 action_step.action, action_step.info,
                                 aggregate_next_time_step.step_type[0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            else:
                tra = Trajectory(aggregate_time_step.step_type[0][0], observation,
                                 action_step.action, action_step.info,
                                 aggregate_next_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)
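# A hedged sketch of a training loop built on collect_step, using TF-Agents'
# standard ReplayBuffer.as_dataset API to sample two-step experience for each
# agent. `agents` (one per AGENT_NAMES entry), the batch size, and the number
# of iterations are assumptions, not taken from the original code.
def train_agents(environment, agents, replay_buffers, num_iterations=10000):
    datasets = [
        rb.as_dataset(sample_batch_size=64, num_steps=2).prefetch(3)
        for rb in replay_buffers
    ]
    iterators = [iter(ds) for ds in datasets]
    collect_policies = [agent.collect_policy for agent in agents]
    for _ in range(num_iterations):
        collect_step(environment, collect_policies, replay_buffers)
        for agent, iterator in zip(agents, iterators):
            experience, _ = next(iterator)
            agent.train(experience)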