def evaluate(self, preprocessor_serialized: dict) -> Tuple[int, int, Any]:
        """Evaluate one episode of the given environment following the given policy. Remote implementation."""
        preprocessor = BaseWrapper.from_serialization(preprocessor_serialized)

        # reset policy states as it might be recurrent
        self.policy.reset_states()

        done = False
        state = preprocessor.modulate(
            (parse_state(self.env.reset()), None, None, None), update=False)[0]
        cumulative_reward = 0
        steps = 0
        while not done:
            probabilities = flatten(
                self.policy.predict(
                    add_state_dims(parse_state(state),
                                   dims=2 if self.is_recurrent else 1)))

            action, _ = self.distribution.act(*probabilities)
            observation, reward, done, _ = self.env.step(action)
            cumulative_reward += reward
            observation, reward, done, _ = preprocessor.modulate(
                (parse_state(observation), reward, done, None), update=False)

            state = observation
            steps += 1

        eps_class = getattr(self.env.unwrapped, "current_target_finger", None)

        return steps, cumulative_reward, eps_class

    def render_fixed_points(self, activations):
        """Render the environment while stepping it with actions decoded from the given hidden activations."""

        self.env.reset()
        for i in range(100):
            self.env.render()
            activation = activations[0, i, :].reshape(1, 1, self.n_hidden)
            probabilities = flatten(self.sub_model_from.predict(activation))

            action, _ = self.distribution.act(*probabilities)
            observation, reward, done, info = self.env.step(action)
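A minimal usage sketch for the evaluate method above, assuming a hypothetical worker instance of this class and an already serialized preprocessor dict; it runs a few episodes and averages the returned statistics.

import numpy as np

# run several evaluation episodes and summarize episode length and return
results = [worker.evaluate(preprocessor_serialized) for _ in range(10)]
lengths, returns, classes = zip(*results)
print(f"mean length: {np.mean(lengths):.1f}, mean return: {np.mean(returns):.2f}")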
Example #3
    def labels(self, all_labels=False):
        """Return the flattened list of label groups, excluding labels ignored for evaluation unless all_labels is set."""
        labels = [
            self.flat, self.human, self.vehicle, self.construction,
            self.objects, self.nature, self.sky, self.void
        ]
        labels = flatten(labels)

        return labels if all_labels else list(
            filter(
                lambda label: label not in self.labels_ignored_for_evaluation,
                labels))
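A short usage sketch for labels(), assuming a hypothetical dataset instance of the class above: all_labels=True returns every label in the eight groups, while the default call filters out those listed in labels_ignored_for_evaluation.

all_labels = dataset.labels(all_labels=True)
eval_labels = dataset.labels()

# the labels dropped by the default call are exactly those marked as ignored for evaluation
ignored = [label for label in all_labels if label not in eval_labels]
assert set(ignored) <= set(dataset.labels_ignored_for_evaluation)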
Example #4
def extract_layers(network: tf.keras.Model, unfold_tds: bool = False) -> List[tf.keras.layers.Layer]:
    """Recursively extract layers from a potentially nested list of Sequentials of unknown depth."""
    if not hasattr(network, "layers"):
        return [network]

    layers = []
    for layer in network.layers:
        if isinstance(layer, (tf.keras.Model, tf.keras.Sequential)):
            layers.append(extract_layers(layer))
        elif isinstance(layer, tf.keras.layers.TimeDistributed) and unfold_tds:
            if isinstance(layer.layer, (tf.keras.Model, tf.keras.Sequential)):
                layers.append(extract_layers(layer.layer))
            else:
                layers.append(layer.layer)
        else:
            layers.append(layer)

    return flatten(layers)
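A usage sketch for extract_layers, assuming it is importable together with the project's flatten helper; it wraps a nested Sequential in a TimeDistributed layer to show how unfold_tds controls whether the wrapper is unpacked.

import tensorflow as tf

inner = tf.keras.Sequential([tf.keras.layers.Dense(8), tf.keras.layers.Dense(4)])
outer = tf.keras.Sequential([
    tf.keras.layers.TimeDistributed(inner, input_shape=(None, 16)),
    tf.keras.layers.LSTM(4),
])

print(extract_layers(outer))                   # the TimeDistributed wrapper is kept as a single layer
print(extract_layers(outer, unfold_tds=True))  # the inner Dense layers replace the wrapper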
Example #5
    def create_episode_gif(self, n: int):
        """Make n GIFs with the current policy."""

        # rebuild model with batch size of 1
        pi, _, _ = self.agent.model_builder(self.env,
                                            **({"bs": 1} if "bs" in fargs(self.agent.model_builder).args else {}))
        pi.set_weights(self.agent.policy.get_weights())

        for j in range(n):
            episode_letter = chr(97 + j)

            # collect an episode
            done = False
            frames = []
            state = parse_state(self.env.reset())
            while not done:
                frames.append(self.env.render(mode="rgb_array"))

                probabilities = flatten(pi.predict(add_state_dims(state, dims=2 if self.agent.is_recurrent else 1)))
                action, _ = self.agent.distribution.act(*probabilities)
                observation, reward, done, _ = self.env.step(
                    numpy.atleast_1d(action) if self.continuous_control else action)
                state = parse_state(observation)

            # build a figure sized to the frame resolution and animate the collected frames into a GIF
            plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
            patch = plt.imshow(frames[0], cmap="Greys" if len(frames[0].shape) == 2 else None)
            plt.axis('off')

            def _animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), _animate, frames=len(frames), interval=50)
            anim.save(f"{self.story_directory}/iteration_{self.agent.iteration}_{episode_letter}.gif",
                      writer='pillow',
                      fps=25)


            plt.close()

    def collect(self, horizon: int, discount: float, lam: float,
                subseq_length: int, preprocessor_serialized: dict):
        """Collect a batch shard of experience for a given number of timesteps."""

        # import here to avoid pickling errors
        import tensorflow as tfl

        # build new environment for each collector to make multiprocessing possible
        if DETERMINISTIC:
            self.env.seed(1)

        preprocessor = BaseWrapper.from_serialization(preprocessor_serialized)

        # reset states of potentially recurrent net
        self.joint.reset_states()

        # buffer storing the experience and stats
        if self.is_recurrent:
            assert horizon % subseq_length == 0, "Subsequence length would require cutting off part of the observations."
            buffer: TimeSequenceExperienceBuffer = TimeSequenceExperienceBuffer.new(
                env=self.env,
                size=horizon // subseq_length,
                seq_len=subseq_length,
                is_continuous=self.is_continuous,
                is_multi_feature=self.is_shadow_brain)
        else:
            buffer: ExperienceBuffer = ExperienceBuffer.new_empty(
                self.is_continuous, self.is_shadow_brain)

        # go for it
        t, current_episode_return, episode_steps, current_subseq_length = 0, 0, 1, 0
        states, rewards, actions, action_probabilities, values, advantages = [], [], [], [], [], []
        episode_endpoints = []
        state = preprocessor.modulate(
            (parse_state(self.env.reset()), None, None, None))[0]
        while t < horizon:
            current_subseq_length += 1

            # based on the given state, predict action distribution and state value; need flatten due to tf eager bug
            policy_out = flatten(
                self.joint.predict(
                    add_state_dims(parse_state(state),
                                   dims=2 if self.is_recurrent else 1)))
            a_distr, value = policy_out[:-1], policy_out[-1]
            states.append(state)
            values.append(np.squeeze(value))

            # from the action distribution sample an action and remember both the action and its probability
            action, action_probability = self.distribution.act(*a_distr)

            action = action if not DETERMINISTIC else np.zeros(action.shape)
            actions.append(action)
            action_probabilities.append(
                action_probability
            )  # should probably ensure that no probability is ever 0

            # make a step based on the chosen action and collect the reward for this state
            observation, reward, done, _ = self.env.step(
                np.atleast_1d(action) if self.is_continuous else action)
            current_episode_return += reward  # true reward for stats

            observation, reward, done, _ = preprocessor.modulate(
                (parse_state(observation), reward, done, None))
            rewards.append(reward)

            # if recurrent, at a subsequence breakpoint/episode end stack the observations and buffer them
            if self.is_recurrent and (current_subseq_length == subseq_length
                                      or done):
                buffer.push_seq_to_buffer(states, actions,
                                          action_probabilities,
                                          values[-current_subseq_length:])

                # clear the buffered information
                states, actions, action_probabilities = [], [], []
                current_subseq_length = 0

            # depending on whether the state is terminal, choose the next state
            if done:
                episode_endpoints.append(t)

                # calculate advantages for the finished episode, where the last value is 0 since it refers to the
                # terminal state that we just observed
                episode_advantages = estimate_episode_advantages(
                    rewards[-episode_steps:], values[-episode_steps:] + [0],
                    discount, lam)
                episode_returns = episode_advantages + values[-episode_steps:]

                if not self.is_recurrent:
                    advantages.append(episode_advantages)
                else:
                    # skip as many steps as are missing to fill the subsequence, then push advantages and returns to the buffer
                    t += subseq_length - (t % subseq_length) - 1
                    buffer.push_adv_ret_to_buffer(episode_advantages,
                                                  episode_returns)

                # reset the environment to receive the next episode's initial state
                state = preprocessor.modulate(
                    (parse_state(self.env.reset()), None, None, None))[0]
                self.joint.reset_states()

                # update/reset some statistics and trackers
                buffer.episode_lengths.append(episode_steps)
                buffer.episode_rewards.append(current_episode_return)
                buffer.episodes_completed += 1
                episode_steps = 1
                current_episode_return = 0
            else:
                state = observation
                episode_steps += 1

            t += 1

        self.env.close()

        # get the value of the last, non-visited state to incorporate it into the advantage estimation of the last visited state
        values.append(
            np.squeeze(
                self.joint.predict(
                    add_state_dims(state,
                                   dims=2 if self.is_recurrent else 1))[-1]))

        # if there was at least one step in the environment after the last episode end, calculate advantages for those steps
        if episode_steps > 1:
            leftover_advantages = estimate_episode_advantages(
                rewards[-episode_steps + 1:], values[-episode_steps:],
                discount, lam)
            if not self.is_recurrent:
                advantages.append(leftover_advantages)
            else:
                leftover_returns = leftover_advantages + values[
                    -len(leftover_advantages) - 1:-1]
                buffer.push_adv_ret_to_buffer(leftover_advantages,
                                              leftover_returns)

        # if not recurrent, fill the buffer with everything we gathered
        if not self.is_recurrent:
            values = np.array(values, dtype="float32")

            # write to the buffer
            advantages = np.hstack(advantages).astype("float32")
            returns = advantages + values[:-1]
            buffer.fill(
                np.array(states, dtype="float32"),
                np.array(actions,
                         dtype="float32" if self.is_continuous else "int32"),
                np.array(action_probabilities, dtype="float32"), advantages,
                returns, values[:-1])

        # normalize advantages
        buffer.normalize_advantages()

        if self.is_recurrent:
            buffer.inject_batch_dimension()

        # convert buffer to dataset and save it to tf record
        dataset, stats = make_dataset_and_stats(
            buffer, is_shadow_brain=self.is_shadow_brain)
        dataset = dataset.map(tf_serialize_example)

        writer = tfl.data.experimental.TFRecordWriter(
            f"{STORAGE_DIR}/data_{self.id}.tfrecord")
        writer.write(dataset)

        return stats, preprocessor
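The advantage and return targets above come from estimate_episode_advantages, which is not shown here. Below is a minimal sketch of the standard GAE(λ) recursion matching the calling convention used above (rewards of length n, values of length n + 1 with a bootstrap value appended); the project's actual helper may differ in details.

import numpy as np

def gae_advantages(rewards, values, discount, lam):
    """Generalized Advantage Estimation: discounted, lambda-weighted sum of one-step TD errors."""
    rewards, values = np.asarray(rewards, dtype="float32"), np.asarray(values, dtype="float32")
    deltas = rewards + discount * values[1:] - values[:-1]  # TD errors; len(values) == len(rewards) + 1
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * lam * running
        advantages[t] = running
    return advantages  # returns-to-go would then be advantages + values[:-1]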