Example #1
def filter_episodes(traj):
    """Map TFRecord windows (of adjacent TimeSteps) to single episodes.

  Outputs the last episode within a sample window. It does this by using
  the step_type tensor to break up sequences into single episode sections.
  For example, if step_type is: [FIRST, MID, LAST, FIRST, MID, MID], we
  will return a sample, whos tensor indices are sampled as:
  [3, 3, 3, 3, 4, 5]. So that the 3rd index frame is replicated 3 times to
  across the beginning of the tensor.

  Args:
    traj: Trajectory.

  Returns:
    Trajectory containing filtered sample with only one episode.
  """
    step_types = traj.step_type
    seq_len = tf.cast(tf.shape(step_types)[0], tf.int32)

    # Find the last start frame in the window. e.g. if we have step types
    # [FIRST, MID, LAST, FIRST, MID, MID], we want index 3.
    first_frames = tf.where(step_types == StepType.FIRST)

    if tf.shape(first_frames)[0] == 0:
        # No first frame, return sequence as is.
        inds = tf.range(0, seq_len)
    else:
        ind_start = tf.cast(first_frames[-1, 0], tf.int32)
        if ind_start == 0:
            # Last episode starts on the first frame, return as is.
            inds = tf.range(0, seq_len)
        else:
            # Otherwise, resample so that the last episode's first frame is
            # replicated to the beginning of the sample. In the example above we want:
            # [3, 3, 3, 3, 4, 5].
            inds_start = tf.tile(ind_start[None], ind_start[None])
            inds_end = tf.range(ind_start, seq_len)
            inds = tf.concat([inds_start, inds_end], axis=0)

    def _resample(arr):
        if isinstance(arr, tf.Tensor):
            return tf.gather(arr, inds)
        else:
            return arr  # empty or None

    observation = tf.nest.map_structure(_resample, traj.observation)

    return Trajectory(step_type=_resample(traj.step_type),
                      action=_resample(traj.action),
                      policy_info=_resample(traj.policy_info),
                      next_step_type=_resample(traj.next_step_type),
                      reward=_resample(traj.reward),
                      discount=_resample(traj.discount),
                      observation=observation)
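A quick way to check the resampling logic above is to run it on the six-step window from the docstring. The sketch below assumes the example's StepType and Trajectory names are the TF-Agents ones (tf_agents.trajectories.time_step.StepType and tf_agents.trajectories.trajectory.Trajectory); the observation values are made up so the gathered indices are visible in the output.

import tensorflow as tf
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory as trajectory_lib

# Assumption: filter_episodes above is in scope and its StepType / Trajectory
# names refer to the tf_agents classes imported here.
F, M, L = ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST
window = trajectory_lib.Trajectory(
    step_type=tf.constant([F, M, L, F, M, M], dtype=tf.int32),
    observation=tf.range(6, dtype=tf.float32),  # obs value == frame index
    action=tf.zeros(6, dtype=tf.float32),
    policy_info=(),
    next_step_type=tf.constant([M, L, F, M, M, L], dtype=tf.int32),
    reward=tf.zeros(6, dtype=tf.float32),
    discount=tf.ones(6, dtype=tf.float32),
)
filtered = filter_episodes(window)
print(filtered.observation.numpy())  # [3. 3. 3. 3. 4. 5.]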
Example #2
def make_trajectory_from(transitions: Sequence[Transition]) -> Trajectory:
    s, a, s_next = zip(*transitions)
    two_cols = np.ones((len(s), 2))
    return Trajectory(
        step_type=StepType.MID * two_cols,
        observation=np.stack((s, s_next), axis=1),
        action=np.stack((a, np.nan * np.ones_like(a)), axis=1),
        policy_info=(),
        next_step_type=StepType.MID * two_cols,
        reward=np.nan * two_cols,
        discount=0.99 * two_cols
    )
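Feeding a few hand-written transitions shows the two-column layout this helper produces. A minimal sketch, assuming each Transition unpacks to a (state, action, next_state) tuple of scalars, which is what the zip(*transitions) above requires; the numbers are made up for illustration.

# hand-written (state, action, next_state) tuples, values made up for illustration
transitions = [(0.0, 1, 1.0), (1.0, 0, 2.0), (2.0, 1, 3.0)]
traj = make_trajectory_from(transitions)
print(traj.observation)  # shape (3, 2): columns are (s, s_next)
print(traj.action)       # shape (3, 2): second column is NaN padding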
Example #3
    def generate_experience_data(self, steps, save_dir):

        time_step = self._tf_env.reset()
        observations = []
        actions = []
        labels = []

        for _ in tqdm(range(steps), 'Generating experience data'):
            action = self._agent.policy.action(time_step).action
            time_step = self._tf_env.step(action=action)

            label = {}
            if isinstance(self.env._env, DoomEnvironment):
                state = self._tf_env.envs[0]._game.get_state()
                self._tf_env.envs[0]._game.advance_action()
                if state is not None:
                    demons = [
                        lbl for lbl in state.labels
                        if lbl.object_name == 'Demon'
                    ]
                    if len(demons) > 0:
                        label['object_angle'] = int(demons[0].object_angle)
                        label['distance_from_wall'] = abs(
                            demons[0].object_position_x)

            observations.append(time_step.observation)
            actions.append(action.numpy()[0])
            labels.append(label)

        observations = np.array([ob.numpy()[0] for ob in observations])
        actions = np.array(actions)
        labels = np.array(labels)

        exp_data = Trajectory(observation=observations,
                              action=actions,
                              policy_info={'satisfaction': labels},
                              step_type=(),
                              next_step_type=(),
                              reward=(),
                              discount=())

        file_path = os.path.join(save_dir, f'exp_data_{steps}.pkl')
        with file_io.FileIO(file_path, mode='wb') as f:
            pickle.dump([exp_data], f, protocol=4)
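Reading the pickled experience back mirrors the write path. A minimal sketch, reusing the same os, pickle, and file_io imports assumed above; save_dir and steps are placeholders for whatever generate_experience_data was called with.

# save_dir and steps are placeholders matching the earlier call
file_path = os.path.join(save_dir, f'exp_data_{steps}.pkl')
with file_io.FileIO(file_path, mode='rb') as f:
    [exp_data] = pickle.load(f)
print(exp_data.observation.shape, exp_data.action.shape)
print(exp_data.policy_info['satisfaction'][:3])  # first few label dicts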
Example #4
def dummy_trajectory_batch(batch_size=2, n_steps=5, obs_dim=2):
    observations = tf.reshape(
        tf.constant(np.arange(batch_size * n_steps * obs_dim),
                    dtype=tf.float32),
        (batch_size, n_steps, obs_dim),
    )

    time_steps = TimeStep(
        step_type=tf.constant([[1] * (n_steps - 2) + [2] * 2] * batch_size,
                              dtype=tf.int32),
        reward=tf.constant([[1] * n_steps] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1.0] * n_steps] * batch_size, dtype=tf.float32),
        observation=observations,
    )
    actions = tf.ones((batch_size, n_steps, 1), dtype=tf.float32)

    action_distribution_parameters = {
        "dist_params": {
            "loc":
            tf.constant([[[10.0]] * n_steps] * batch_size, dtype=tf.float32),
            "scale":
            tf.constant([[[10.0]] * n_steps] * batch_size, dtype=tf.float32),
        },
        "value_prediction":
        tf.constant([[0.0] * n_steps] * batch_size, dtype=tf.float32),
    }

    policy_info = action_distribution_parameters

    return Trajectory(
        time_steps.step_type,
        observations,
        actions,
        policy_info,
        time_steps.step_type,
        time_steps.reward,
        time_steps.discount,
    )
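For reference, calling the helper with its defaults produces the shapes below; a quick sketch (the printed values assume batch_size=2, n_steps=5, obs_dim=2).

traj = dummy_trajectory_batch()  # defaults: batch_size=2, n_steps=5, obs_dim=2
print(traj.observation.shape)                        # (2, 5, 2)
print(traj.step_type.numpy()[0])                     # [1 1 1 2 2]
print(traj.policy_info["dist_params"]["loc"].shape)  # (2, 5, 1)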
def random_collect_step(environment, policies, replay_buffers):

    aggregate_time_step = environment.current_time_step()
    # On the first step the env reward is [0]; on later steps it is [[r1,r2,r3,r4]].
    is_first_step = aggregate_time_step.reward.shape == (1,)
    aggregate_action_step = {}  # actions in the form [[e1,e2,e3,e4]]
    squeezed_action_step = {}  # actions in the form [e1,e2,e3,e4]
    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            # create a time_step that satisfies the spec of a single random policy
            time_step = TimeStep(aggregate_time_step.step_type[0],
                                 aggregate_time_step.reward[0],
                                 aggregate_time_step.discount[0],
                                 aggregate_time_step.observation[name][0])

            # the random policy receives the observation (time_step) and returns an action (action_step)
            action_step = policies[i].action(time_step)
            squeezed_action_step[name] = action_step

            # create an action_step (policy_step) that satisfies the spec of the aggregate environment
            action = tf.convert_to_tensor([action_step.action],
                                          dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step
    else:
        for i, name in enumerate(AGENT_NAMES):
            # create a time_step that satisfies the spec of a single random policy
            time_step = TimeStep(aggregate_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0],
                                 aggregate_time_step.observation[name][0])

            # the random policy receives the observation (time_step) and returns an action (action_step)
            action_step = policies[i].action(time_step)
            squeezed_action_step[name] = action_step

            # create an action_step (policy_step) that satisfies the spec of the aggregate environment
            action = tf.convert_to_tensor([action_step.action],
                                          dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step

    # let the environment take one step forward.
    aggregate_next_time_step = environment.step(aggregate_action_step)

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            tra = Trajectory(aggregate_time_step.step_type[0], observation,
                             action_step.action, action_step.info,
                             aggregate_next_time_step.step_type[0][0],
                             aggregate_time_step.reward[0],
                             aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)

    else:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            if aggregate_next_time_step.step_type.shape == (1, ):
                tra = Trajectory(aggregate_time_step.step_type[0][0],
                                 observation, action_step.action,
                                 action_step.info,
                                 aggregate_next_time_step.step_type[0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            else:
                tra = Trajectory(aggregate_time_step.step_type[0][0],
                                 observation, action_step.action,
                                 action_step.info,
                                 aggregate_next_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)
def collect_step(environment, policies, replay_buffers):

    aggregate_time_step = environment.current_time_step()
    # On the first step the env reward is [0]; on later steps it is [[r1,r2,r3,r4]].
    is_first_step = aggregate_time_step.reward.shape == (1,)
    aggregate_action_step = {}  # actions in the form [[e1,e2,e3,e4]]

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            # extract the observation and construct a time step for each agent separately;
            # for the first step, the env's output step type has shape [num]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            time_step = TimeStep(aggregate_time_step.step_type[0],
                                 aggregate_time_step.reward[0],
                                 aggregate_time_step.discount[0], observation)

            # the agent policy receives the time_step and outputs a single action
            action_step = policies[i].action(time_step)

            # add the single action to the joint action
            action = tf.convert_to_tensor(action_step.action, dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step
    else:
        for i, name in enumerate(AGENT_NAMES):
            # extract the observation and construct a time step for each agent separately;
            # for steps other than the first, the env's output step type has shape [[num,num,num,num]]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            time_step = TimeStep(aggregate_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0], observation)

            # the agent policy receives the time_step and outputs a single action
            action_step = policies[i].action(time_step)

            # add the single action to the joint action
            action = tf.convert_to_tensor(action_step.action, dtype='float32')
            action_step = policy_step.PolicyStep(action, action_step.state,
                                                 action_step.info)
            aggregate_action_step[name] = action_step

    # let the environment take one step forward.
    aggregate_next_time_step = environment.step(aggregate_action_step)

    if is_first_step:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            tra = Trajectory(aggregate_time_step.step_type[0], observation,
                             action_step.action, action_step.info,
                             aggregate_next_time_step.step_type[0][0],
                             aggregate_time_step.reward[0],
                             aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)

    else:
        for i, name in enumerate(AGENT_NAMES):
            action_step = aggregate_action_step[name]
            observation = tf.convert_to_tensor(
                aggregate_time_step.observation[name].numpy(), dtype='float32')
            if aggregate_next_time_step.step_type.shape == (1, ):
                tra = Trajectory(aggregate_time_step.step_type[0][0],
                                 observation, action_step.action,
                                 action_step.info,
                                 aggregate_next_time_step.step_type[0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            else:
                tra = Trajectory(aggregate_time_step.step_type[0][0],
                                 observation, action_step.action,
                                 action_step.info,
                                 aggregate_next_time_step.step_type[0][0],
                                 aggregate_time_step.reward[0][i],
                                 aggregate_time_step.discount[0])
            replay_buffers[i].add_batch(tra)
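Both collectors share the same calling convention, so a typical driver warms the replay buffers up with the random policies first and then switches to the agents' own policies. A minimal sketch, assuming env, random_policies, agent_policies, and replay_buffers (one per entry in AGENT_NAMES) are already constructed; the names and loop counts are placeholders.

env.reset()
for _ in range(1000):   # placeholder: number of random warm-up steps
    random_collect_step(env, random_policies, replay_buffers)
for _ in range(10000):  # placeholder: number of on-policy collection steps
    collect_step(env, agent_policies, replay_buffers)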