def evaluate(self, preprocessor_serialized: dict) -> Tuple[int, int, Any]:
    """Evaluate one episode of the given environment following the given policy. Remote implementation."""
    preprocessor = BaseWrapper.from_serialization(preprocessor_serialized)

    # reset policy states as it might be recurrent
    self.policy.reset_states()

    done = False
    state = preprocessor.modulate((parse_state(self.env.reset()), None, None, None), update=False)[0]
    cumulative_reward = 0
    steps = 0
    while not done:
        probabilities = flatten(
            self.policy.predict(add_state_dims(parse_state(state), dims=2 if self.is_recurrent else 1)))

        action, _ = self.distribution.act(*probabilities)
        observation, reward, done, _ = self.env.step(action)
        cumulative_reward += reward
        observation, reward, done, _ = preprocessor.modulate((parse_state(observation), reward, done, None),
                                                             update=False)

        state = observation
        steps += 1

    eps_class = self.env.unwrapped.current_target_finger if hasattr(self.env.unwrapped, "current_target_finger") \
        else None

    return steps, cumulative_reward, eps_class
def render_fixed_points(self, activations):
    """Render an episode in which actions are decoded from the given sequence of hidden activations."""
    self.env.reset()

    for i in range(100):
        self.env.render()

        activation = activations[0, i, :].reshape(1, 1, self.n_hidden)
        probabilities = flatten(self.sub_model_from.predict(activation))

        action, _ = self.distribution.act(*probabilities)
        observation, reward, done, info = self.env.step(action)
def labels(self, all_labels=False):
    """Return the flattened list of labels, excluding those ignored for evaluation unless all_labels is set."""
    labels = [self.flat, self.human, self.vehicle, self.construction, self.objects, self.nature, self.sky, self.void]
    labels = flatten(labels)

    return labels if all_labels else list(
        filter(lambda label: label not in self.labels_ignored_for_evaluation, labels))
def extract_layers(network: tf.keras.Model, unfold_tds: bool = False) -> List[tf.keras.layers.Layer]:
    """Recursively extract layers from a potentially nested list of Sequentials of unknown depth."""
    if not hasattr(network, "layers"):
        return [network]

    layers = []
    for l in network.layers:
        if isinstance(l, tf.keras.Model) or isinstance(l, tf.keras.Sequential):
            layers.append(extract_layers(l))
        elif isinstance(l, tf.keras.layers.TimeDistributed) and unfold_tds:
            if isinstance(l.layer, tf.keras.Model) or isinstance(l.layer, tf.keras.Sequential):
                layers.append(extract_layers(l.layer))
            else:
                layers.append(l.layer)
        else:
            layers.append(l)

    return flatten(layers)
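# Illustrative usage sketch for extract_layers (not part of the original module); it assumes the
# flatten() helper used above is available. A Sequential nested inside another Sequential is
# unfolded into a single flat list of layers.
example_inner = tf.keras.Sequential([tf.keras.layers.Dense(8), tf.keras.layers.Dense(4)])
example_outer = tf.keras.Sequential([tf.keras.layers.Dense(16), example_inner])

# expected: the three Dense layers, with the nested Sequential unfolded
print([layer.name for layer in extract_layers(example_outer)])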
def create_episode_gif(self, n: int):
    """Make n GIFs with the current policy."""

    # rebuild model with batch size of 1
    pi, _, _ = self.agent.model_builder(
        self.env, **({"bs": 1} if "bs" in fargs(self.agent.model_builder).args else {}))
    pi.set_weights(self.agent.policy.get_weights())

    for j in range(n):
        episode_letter = chr(97 + j)

        # collect an episode
        done = False
        frames = []
        state = parse_state(self.env.reset())
        while not done:
            frames.append(self.env.render(mode="rgb_array"))

            probabilities = flatten(pi.predict(add_state_dims(state, dims=2 if self.agent.is_recurrent else 1)))
            action, _ = self.agent.distribution.act(*probabilities)
            observation, reward, done, _ = self.env.step(
                numpy.atleast_1d(action) if self.continuous_control else action)
            state = parse_state(observation)

        # the figure
        plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
        patch = plt.imshow(frames[0], cmap="Greys" if len(frames[0].shape) == 2 else None)
        plt.axis('off')

        def _animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), _animate, frames=len(frames), interval=50)
        anim.save(f"{self.story_directory}/iteration_{self.agent.iteration}_{episode_letter}.gif",
                  writer='pillow', fps=25)

        plt.close()
def collect(self, horizon: int, discount: float, lam: float, subseq_length: int, preprocessor_serialized: dict):
    """Collect a batch shard of experience for a given number of timesteps."""

    # import here to avoid pickling errors
    import tensorflow as tfl

    # build new environment for each collector to make multiprocessing possible
    if DETERMINISTIC:
        self.env.seed(1)

    preprocessor = BaseWrapper.from_serialization(preprocessor_serialized)

    # reset states of potentially recurrent net
    self.joint.reset_states()

    # buffer storing the experience and stats
    if self.is_recurrent:
        assert horizon % subseq_length == 0, "Subsequence length would require cutting of part of the observations."
        buffer: TimeSequenceExperienceBuffer = TimeSequenceExperienceBuffer.new(
            env=self.env,
            size=horizon // subseq_length,
            seq_len=subseq_length,
            is_continuous=self.is_continuous,
            is_multi_feature=self.is_shadow_brain)
    else:
        buffer: ExperienceBuffer = ExperienceBuffer.new_empty(self.is_continuous, self.is_shadow_brain)

    # go for it
    t, current_episode_return, episode_steps, current_subseq_length = 0, 0, 1, 0
    states, rewards, actions, action_probabilities, values, advantages = [], [], [], [], [], []
    episode_endpoints = []
    state = preprocessor.modulate((parse_state(self.env.reset()), None, None, None))[0]
    while t < horizon:
        current_subseq_length += 1

        # based on the given state, predict action distribution and state value; need flatten due to tf eager bug
        policy_out = flatten(
            self.joint.predict(add_state_dims(parse_state(state), dims=2 if self.is_recurrent else 1)))
        a_distr, value = policy_out[:-1], policy_out[-1]
        states.append(state)
        values.append(np.squeeze(value))

        # from the action distribution sample an action and remember both the action and its probability
        action, action_probability = self.distribution.act(*a_distr)
        action = action if not DETERMINISTIC else np.zeros(action.shape)
        actions.append(action)
        action_probabilities.append(action_probability)  # should probably ensure that no probability is ever 0

        # make a step based on the chosen action and collect the reward for this state
        observation, reward, done, _ = self.env.step(np.atleast_1d(action) if self.is_continuous else action)
        current_episode_return += reward  # true reward for stats

        observation, reward, done, _ = preprocessor.modulate((parse_state(observation), reward, done, None))
        rewards.append(reward)

        # if recurrent, at a subsequence breakpoint/episode end stack the observations and buffer them
        if self.is_recurrent and (current_subseq_length == subseq_length or done):
            buffer.push_seq_to_buffer(states, actions, action_probabilities, values[-current_subseq_length:])

            # clear the buffered information
            states, actions, action_probabilities = [], [], []
            current_subseq_length = 0

        # depending on whether the state is terminal, choose the next state
        if done:
            episode_endpoints.append(t)

            # calculate advantages for the finished episode, where the last value is 0 since it refers to the
            # terminal state that we just observed
            episode_advantages = estimate_episode_advantages(rewards[-episode_steps:],
                                                             values[-episode_steps:] + [0],
                                                             discount, lam)
            episode_returns = episode_advantages + values[-episode_steps:]

            if not self.is_recurrent:
                advantages.append(episode_advantages)
            else:
                # skip as many steps as are missing to fill the subsequence, then push adv and ret to buffer
                t += subseq_length - (t % subseq_length) - 1
                buffer.push_adv_ret_to_buffer(episode_advantages, episode_returns)

            # reset environment to receive next episode's initial state
            state = preprocessor.modulate((parse_state(self.env.reset()), None, None, None))[0]
            self.joint.reset_states()

            # update/reset some statistics and trackers
            buffer.episode_lengths.append(episode_steps)
            buffer.episode_rewards.append(current_episode_return)
            buffer.episodes_completed += 1
            episode_steps = 1
            current_episode_return = 0
        else:
            state = observation
            episode_steps += 1

        t += 1

    self.env.close()

    # get last non-visited state's value to incorporate it into the advantage estimation of last visited state
    values.append(np.squeeze(self.joint.predict(add_state_dims(state, dims=2 if self.is_recurrent else 1))[-1]))

    # if there was at least one step in the environment after the last episode end, calculate advantages for them
    if episode_steps > 1:
        leftover_advantages = estimate_episode_advantages(rewards[-episode_steps + 1:], values[-episode_steps:],
                                                          discount, lam)
        if not self.is_recurrent:
            advantages.append(leftover_advantages)
        else:
            leftover_returns = leftover_advantages + values[-len(leftover_advantages) - 1:-1]
            buffer.push_adv_ret_to_buffer(leftover_advantages, leftover_returns)

    # if not recurrent, fill the buffer with everything we gathered
    if not self.is_recurrent:
        values = np.array(values, dtype="float32")

        # write to the buffer
        advantages = np.hstack(advantages).astype("float32")
        returns = advantages + values[:-1]
        buffer.fill(np.array(states, dtype="float32"),
                    np.array(actions, dtype="float32" if self.is_continuous else "int32"),
                    np.array(action_probabilities, dtype="float32"),
                    advantages,
                    returns,
                    values[:-1])

    # normalize advantages
    buffer.normalize_advantages()

    if self.is_recurrent:
        buffer.inject_batch_dimension()

    # convert buffer to dataset and save it to tf record
    dataset, stats = make_dataset_and_stats(buffer, is_shadow_brain=self.is_shadow_brain)
    dataset = dataset.map(tf_serialize_example)

    writer = tfl.data.experimental.TFRecordWriter(f"{STORAGE_DIR}/data_{self.id}.tfrecord")
    writer.write(dataset)

    return stats, preprocessor
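# Minimal reference sketch of generalized advantage estimation (GAE), which collect() presumably
# delegates to via estimate_episode_advantages; this is an illustrative stand-alone helper, not the
# original implementation. It computes delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and
# A_t = delta_t + gamma * lam * A_{t+1}, iterating backwards over one episode.
def _gae_sketch(rewards, values, discount, lam):
    """Illustrative GAE over one episode; values holds one more entry than rewards (bootstrap value last)."""
    advantages = np.zeros(len(rewards), dtype="float32")
    running = 0.0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + discount * values[i + 1] - values[i]
        running = delta + discount * lam * running
        advantages[i] = running

    return advantages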