def run_episode(env: gym.Env, agent: Agent, training: bool, render_mode: str) \
        -> Trajectory:
    '''
    Runs a single episode of a single-agent RL loop until termination.

    :param env: OpenAI gym environment
    :param agent: Agent policy used to take actions in the environment
                  and to process simulated experiences
    :param training: (boolean) Whether the agent will learn from the
                     experience it receives
    :param render_mode: Feature not implemented (yet!)
    :returns: Episode trajectory, a list of (o, a, r, o') tuples
    '''
    observation = env.reset()
    done = False
    trajectory = Trajectory(env_type=EnvType.SINGLE_AGENT)
    legal_actions: List = None
    while not done:
        if agent.requires_environment_model:
            action = agent.model_based_take_action(deepcopy(env), observation,
                                                   player_index=0)
        else:
            action = agent.model_free_take_action(observation, legal_actions)
        succ_observation, reward, done, info = env.step(action)
        trajectory.add_timestep(observation, action, reward,
                                succ_observation, done)
        if training:
            agent.handle_experience(observation, action, reward,
                                    succ_observation, done)
        observation = succ_observation
        if 'legal_actions' in info:
            legal_actions = info['legal_actions']
    return trajectory
def _check_reset_seed(env: gym.Env, seed: Optional[int] = None) -> None:
    """
    Check that the environment can be reset with a random seed.
    """
    signature = inspect.signature(env.reset)
    assert (
        "seed" in signature.parameters or "kwargs" in signature.parameters
    ), "The environment cannot be reset with a random seed. This behavior will be deprecated in the future."

    try:
        env.reset(seed=seed)
    except TypeError as e:
        raise AssertionError(
            "The environment cannot be reset with a random seed, even though `seed` or `kwargs` "
            "appear in the signature. This should never happen, please report this issue. "
            "The error was: " + str(e)
        )

    if env.unwrapped.np_random is None:
        logger.warn(
            "Resetting the environment did not result in seeding its random number generator. "
            "This is likely due to not calling `super().reset(seed=seed)` in the `reset` method. "
            "If you do not use the python-level random number generator, this is not a problem."
        )

    seed_param = signature.parameters.get("seed")
    # Check the default value is None
    if seed_param is not None and seed_param.default is not None:
        logger.warn(
            "The default seed argument in reset should be `None`, "
            "otherwise the environment will by default always be deterministic"
        )
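# For reference, a minimal sketch of an environment whose `reset` satisfies the
# check above. The class name and space choices are illustrative only; it
# assumes a gym version whose base `gym.Env.reset` accepts a `seed` keyword
# (so `super().reset(seed=seed)` seeds `self.np_random`), and that `gym` and
# `typing.Optional` are already imported as elsewhere in this module.
class _SeedableDummyEnv(gym.Env):
    observation_space = gym.spaces.Discrete(2)
    action_space = gym.spaces.Discrete(2)

    def reset(self, seed: Optional[int] = None, **kwargs):
        # Seeds the env-level RNG, so `env.unwrapped.np_random` is initialised.
        super().reset(seed=seed)
        return self.observation_space.sample()

    def step(self, action):
        return self.observation_space.sample(), 0.0, True, {}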
def _check_render(env: gym.Env, warn: bool = True, headless: bool = False) -> None:
    """
    Check the declared render modes and the `render()`/`close()` method of the environment.

    :param env: The environment to check
    :param warn: Whether to output additional warnings
    :param headless: Whether to disable render modes that require a graphical interface.
        False by default.
    """
    render_modes = env.metadata.get("render.modes")
    if render_modes is None:
        if warn:
            warnings.warn(
                "No render modes were declared in the environment "
                "(env.metadata['render.modes'] is None or not defined), "
                "you may have trouble when calling `.render()`"
            )
    else:
        # Don't check render modes that require a
        # graphical interface (useful for CI)
        if headless and "human" in render_modes:
            render_modes.remove("human")
        # Check all declared render modes
        for render_mode in render_modes:
            env.render(mode=render_mode)
        env.close()
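# A minimal illustration (hypothetical class, not from the source) of the
# metadata layout `_check_render` expects: the declared modes live under
# `env.metadata["render.modes"]`, and `render(mode=...)` is called for each one.
class _RenderableDummyEnv(gym.Env):
    metadata = {"render.modes": ["rgb_array"]}

    def render(self, mode: str = "rgb_array"):
        # Returns a blank frame; a real environment would draw its current state.
        return np.zeros((64, 64, 3), dtype=np.uint8)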
def iterate_batches(env: gym.Env, net: nn.Module, batch_size: int):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)
    while True:
        obs_v = torch.FloatTensor([obs])
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, done, _ = env.step(action)
        episode_reward += reward
        step = EpisodeStep(obs, action)
        episode_steps.append(step)
        if done:
            e = Episode(episode_reward, episode_steps)
            batch.append(e)
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs
def train(agent: Agent, env: gym.Env, episodes: int, render=True):
    """Train `agent` in `env` for `episodes` episodes

    Args:
        agent (Agent): Agent to train
        env (gym.Env): Environment to train the agent in
        episodes (int): Number of episodes to train for
        render (bool): True=Enable/False=Disable rendering; Default=True
    """
    for episode in range(episodes):
        done = False
        state = env.reset()
        total_reward = 0
        rewards = []
        states = []
        actions = []
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards.append(reward)
            states.append(state)
            actions.append(action)
            state = next_state
            total_reward += reward
            if render:
                env.render()
            if done:
                agent.learn(states, rewards, actions)
                print("\n")
        print(f"Episode#:{episode} ep_reward:{total_reward}", end="\r")
def run_episode(env: Env, agent: Agent, mdp_id: int = 0,
                max_steps: Optional[int] = None) -> Trajectory:
    """
    Run a single episode and return its trajectory. After max_steps (if
    specified), the environment is assumed to be terminal. The mdp_id of the
    episode can also be specified.
    """
    trajectory = Trajectory()
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        if max_steps is not None and num_steps >= max_steps:
            terminal = True
        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        num_steps += 1
    return trajectory
def render_episode(env: gym.Env, model: tf.keras.Model, max_steps: int):
    state = tf.constant(env.reset(), dtype=tf.float32)
    screen = env.render(mode='rgb_array')
    im = Image.fromarray(screen)
    images = [im]
    for i in range(1, max_steps + 1):
        state = tf.expand_dims(state, 0)
        qvalues = model(state)
        action = np.argmax(np.squeeze(qvalues))
        state, _, done, _ = env.step(action)
        state = tf.constant(state, dtype=tf.float32)
        # Render the screen at every step (increase the modulus, e.g. `% 10`,
        # to render less often).
        if i % 1 == 0:
            screen = env.render(mode='rgb_array')
            images.append(Image.fromarray(screen))
        if done:
            break
    return images
def gather_experience(self, env: gym.Env, time_limit: int) -> float:
    state = env.reset()
    done = False
    total_reward, reward, timesteps = 0, 0, 0
    while not done:
        action_probs = self.action_probs(state)
        action_chosen = np.random.choice(self.action_space, p=action_probs)
        # Store the transition before stepping: `reward` here is the reward
        # received for reaching the current `state`.
        self.memory_buffer.update(state, action_chosen, action_probs, reward)
        state, reward, done, info = env.step(action_chosen)
        total_reward += reward
        timesteps += 1
        if timesteps >= time_limit:
            break
    if not done:
        # The episode was truncated by the time limit: overwrite the last
        # stored reward with a large negative penalty.
        self.memory_buffer.rewards[-1] = -(1 / (1 - self.GAMMA))
    env.close()
    return total_reward
def generate_gif(env: gym.Env, n_steps: int = 20, suffix: str = "smm_env_",
                 **kwargs) -> None:
    """Plot a few steps of an env and generate a .gif."""
    tmp_dir = tempfile.TemporaryDirectory()
    _ = env.reset()
    for s in tqdm(range(n_steps)):
        obs, _, _, _ = env.step(5)
        fig, ax = plot_smm_obs(obs, **kwargs)
        fig.suptitle(f"Step: {s}")
        fig.tight_layout()
        fig.savefig(f'{os.path.join(tmp_dir.name, str(s))}.png')
        plt.close('all')
    fns = glob.glob(f'{tmp_dir.name}/*.png')
    sorted_idx = np.argsort([
        int(f.split(tmp_dir.name)[1].split('.png')[0].split('/')[1])
        for f in fns
    ])
    fns = np.array(fns)[sorted_idx]
    output_path = f"{suffix}replay.gif"
    images = [imageio.imread(f) for f in fns]
    imageio.mimsave(output_path, images, duration=0.1, subrectangles=True)
    tmp_dir.cleanup()
def train_setup(agent: Agent, env: gym.Env):
    num_games = 150
    n_step = 0
    best_score = 0
    scores = []
    for i in range(num_games):
        done = False
        state = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(state)
            new_state, reward, done, _ = env.step(action)
            n_step += 1
            score += reward
            agent.remember(state, action, reward, new_state, int(done))
            agent.learn()
            state = new_state
            if score == 200:
                print("Score maxed")
                done = True
        scores.append(score)
        avg_score = np.mean(scores[-100:])
        print(
            f"Game: {i}\tScore: {score}\tEpsilon: {agent.epsilon}\tAverage score: {avg_score}"
        )
        if avg_score > best_score:
            print("New best average.")
            agent.save_models()
            best_score = avg_score
def set_weights(model: tf.keras.Model, env: gym.Env, num_steps: int = 100,
                set_bias: bool = True, set_weight: bool = True,
                env_max_steps: int = None):
    rewards = gather_data(env, num_steps, env_max_steps)
    output_layer: tf.keras.layers.Layer = model.layers[-1]
    while hasattr(output_layer, "layers"):
        output_layer = output_layer.layers[-1]
    W, b = output_layer.trainable_weights
    if set_bias:
        reward_mean = tf.reduce_mean(rewards)
        new_bias = tf.fill(b.shape, reward_mean)
        print(
            f" [Trickster] - Clever bias init: shape: {b.shape} value: {reward_mean}"
        )
        b.assign(new_bias)
    if set_weight:
        reward_std = tf.math.reduce_std(rewards)
        orthogonal_initializer = tf.keras.initializers.Orthogonal(gain=reward_std)
        print(
            f" [Trickster] - Clever weight init: shape: {W.shape} gain: {reward_std}"
        )
        new_weight = orthogonal_initializer(W.shape)
        W.assign(new_weight)
    env.reset()
def _create_replay_buffer_and_insert(env: gym.Env):
    env.seed(1)
    replay_buffer = ReplayBuffer.create_from_env(
        env, replay_memory_size=10, batch_size=1
    )
    replay_buffer_inserter = make_replay_buffer_inserter(env)
    obs = env.reset()
    inserted = []
    terminal = False
    i = 0
    while not terminal and i < 5:
        logger.info(f"Iteration: {i}")
        action = env.action_space.sample()
        next_obs, reward, terminal, _ = env.step(action)
        inserted.append(
            {
                "observation": obs,
                "action": action,
                "reward": reward,
                "terminal": terminal,
            }
        )
        log_prob = 0.0
        replay_buffer_inserter(replay_buffer, obs, action, reward, terminal, log_prob)
        obs = next_obs
        i += 1
    return replay_buffer, inserted
def play_episode(env: gym.Env, agent: Agent, replay_memory: ReplayMemory,
                 eps: float, batch_size: int) -> int:
    """Play an episode and train

    Args:
        env (gym.Env): gym environment (CartPole-v0)
        agent (Agent): agent will train and get action
        replay_memory (ReplayMemory): trajectory is saved here
        eps (float): 𝜺-greedy for exploration
        batch_size (int): batch size

    Returns:
        int: reward earned in this episode
    """
    s = env.reset()
    done = False
    total_reward = 0
    while not done:
        a = agent.get_action(s, eps)
        s2, r, done, info = env.step(a)
        total_reward += r
        if done:
            r = -1
        replay_memory.push(s, a, r, s2, done)
        if len(replay_memory) > batch_size:
            minibatch = replay_memory.pop(batch_size)
            train_helper(agent, minibatch, FLAGS.gamma)
        s = s2
    return total_reward
def test(self, env: gym.Env, policy: Optional[Policy] = None,
         nb_episodes: int = 10, seed: Optional[int] = None,
         experiment_name: str = "", **_kwargs) -> History:
    """Test the agent."""
    if policy is None:
        policy = GreedyPolicy()
    policy.action_space = env.action_space
    policy.model = self
    set_seed(seed)
    set_env_seed(seed, env)
    history: List[List[AgentObs]] = []
    current_episode: List[AgentObs] = []
    for _ in range(nb_episodes):
        done = False
        s = env.reset()
        while not done:
            a = policy.get_action(s)
            sp, r, done, info = env.step(a)
            current_episode.append((s, a, r, sp))
            s = sp
        history.append(current_episode)
        current_episode = []
    return History(history, is_training=False, seed=seed, name=experiment_name)
def test(
    self,
    env: gym.Env,
    callbacks: Collection[Callback] = (),
    nb_episodes: int = 1000,
    experiment_name: str = "",
):
    """Test."""
    context = Context(experiment_name, self, env, callbacks)
    context.on_training_begin()
    for episode in range(nb_episodes):
        self.current_state = 0
        state = env.reset()
        done = False
        step = 0
        context.on_episode_begin(episode)
        while not done:
            action = self.choose_best_action(state)
            context.on_step_begin(step, action)
            state2, reward, done, info = env.step(action)
            observation = (state, action, reward, state2, done)
            self.do_pdfa_transition(observation)
            context.on_step_end(step, observation)
            state = state2
            step += 1
        context.on_episode_end(episode)
    context.on_training_end()
def __init__(self, env, max_length=np.inf, dense_reward=True, save_fr=10,
             save_dest="state_box", render=False):
    Env.__init__(self)
    DictSerializable.__init__(self, DictSerializable.get_numpy_save())
    self.eval_env = env
    # Mirror the wrapped environment's action and observation spaces
    # (they must be gym.spaces objects).
    if env is not None:
        self.action_space = self.eval_env.action_space
        self.observation_space = self.eval_env.observation_space
    self._dense_reward = dense_reward
    self.partial_reward = 0.
    self.partial_length = 0
    self.returns = []
    self.episode_lengths = []
    self.successes = []
    self._unused = True
    self._max_length = max_length
    self.max_episode_steps = max_length
    self._save_fr = save_fr
    self._save_dest = save_dest
    self._render = render
def train(
    self,
    env: gym.Env,
    callbacks: Collection[Callback] = (),
    nb_episodes: int = 1000,
    experiment_name: str = "",
):
    """Train."""
    context = Context(experiment_name, self, env, callbacks)
    context.on_training_begin()
    for episode in range(nb_episodes):
        state = env.reset()
        done = False
        step = 0
        self._episode_reset()
        self.stop = self._should_stop()
        context.on_episode_begin(episode)
        while not done and not self.done():
            action = random_policy(self, state)
            context.on_step_begin(step, action)
            state2, reward, done, info = env.step(action)
            observation = (state, action, reward, state2, done)
            self.observe(observation)
            context.on_step_end(step, observation)
            state = state2
            step += 1
        self._add_trace()
        if episode % self.update_frequency == 0:
            self._learn_pdfa()
        context.on_episode_end(episode)
    context.on_training_end()
def play_episode(env: gym.Env, agent: Agent, replay_memory: ReplayMemory,
                 eps: float, batch_size: int, gamma: float) -> int:
    """Play an episode and train

    Args:
        env (gym.Env): gym environment (CartPole-v0)
        agent (Agent): agent will train and get action
        replay_memory (ReplayMemory): trajectory is saved here
        eps (float): ε-greedy exploration rate
        batch_size (int): batch size
        gamma (float): discount factor

    Returns:
        int: reward earned in this episode
    """
    state = env.reset()
    done, total_reward = False, 0
    while not done:
        a = agent.get_action(state, eps)
        state_2, reward, done, info = env.step(a)
        total_reward += reward
        if done:
            reward = -1  # Game lost, so terminal reward is -1
        replay_memory.push(state, a, reward, state_2, done)
        if len(replay_memory) > batch_size:
            minibatch = replay_memory.pop(batch_size)
            train_helper(agent=agent, minibatch=minibatch, gamma=gamma)
        state = state_2
    return total_reward
def rollout(
    env: gym.Env,
    nb_episodes: int = 1,
    max_steps: Optional[int] = None,
    policy=lambda env, state: _random_action(env, state),
    callback=lambda env, step: None,
):
    """
    Do a rollout.

    :param env: the OpenAI Gym environment.
    :param nb_episodes: the number of rollout episodes.
    :param max_steps: maximum number of steps per episode.
    :param policy: a callable that takes the environment and the state
        and returns the action.
    :param callback: a callback that takes the environment and is called
        at each step.
    :return: None
    """
    if max_steps:
        env = TimeLimit(env, max_episode_steps=max_steps)
    for _ in range(nb_episodes):
        state = env.reset()
        done = False
        callback(env, (state, 0.0, done, {}))
        while not done:
            action = policy(env, state)
            state, reward, done, info = env.step(action)
            callback(env, (state, reward, done, info))
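# `_random_action` is referenced above but not shown here; a minimal sketch of
# what the default policy is assumed to do (sample uniformly from the action
# space, ignoring the state). The exact definition in the source may differ.
def _random_action(env: gym.Env, state):
    return env.action_space.sample()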
def run_episode(environment: gym.Env, agent: DQNAgent, render: bool,
                max_length: int):
    """
    Run one episode in the given environment with the agent.

    Arguments:
        environment {`gym.Env`} -- Environment representing the Markov Decision Process
        agent {`DQNAgent`} -- Reinforcement Learning agent that acts in the environment
        render {`bool`} -- Whether the frames of the episode should be rendered on the screen
        max_length {`int`} -- Maximum number of steps before the episode is terminated

    Returns:
        `float` -- Cumulative reward that the agent received during the episode
    """
    episode_reward = 0
    state = environment.reset()
    for _ in range(max_length):
        if render:
            environment.render()
        action = agent.act(state)
        next_state, reward, terminal, _ = environment.step(action)
        agent.observe(
            Transition(state, action, reward, None if terminal else next_state))
        episode_reward += reward
        if terminal:
            break
        else:
            state = next_state
    return episode_reward
def selection_phase(node: SequentialNode, state: gym.Env,
                    selection_strat: Callable[[SequentialNode, float], Dict[int, float]],
                    exploration_factor: float) \
        -> Tuple[SequentialNode, Union[int, None]]:
    '''
    Traverses the tree starting at :param: node, following :param:
    selection_strat to decide which child (branch) to follow, and updating the
    :param: state environment model during traversal. This phase terminates
    when an edge is selected that leads to an un-expanded child node, or when
    a terminal node is reached.

    :param node: Current node of the MCTS tree being traversed
    :param state: Environment model
    :param selection_strat: Selection policy used to select the most promising
                            child node to traverse to
    :param exploration_factor: Exploration factor for :param: selection_strat
    :returns: A node and an action leading to an un-expanded child
    '''
    if node.is_terminal:
        return node, None
    scores = selection_strat(node, exploration_factor)
    best_a_i = choice(extract_best_actions(scores))
    if best_a_i not in node.children:
        return node, best_a_i
    else:
        state.step(best_a_i)
        return selection_phase(node.children[best_a_i], state,
                               selection_strat, exploration_factor)
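# A hedged sketch of a UCB1/UCT-style `selection_strat` compatible with the
# signature expected above. The `SequentialNode` internals used here
# (`actions`, `children`, `visits`, `total_value`) and the helper name
# `uct_scores` are assumptions for illustration, not taken from the source;
# un-expanded actions receive an infinite score so that `selection_phase`
# stops at them.
import math


def uct_scores(node: SequentialNode, exploration_factor: float) -> Dict[int, float]:
    scores: Dict[int, float] = {}
    for a_i in node.actions:
        child = node.children.get(a_i)
        if child is None or child.visits == 0:
            # Prioritise actions that have never been tried.
            scores[a_i] = float('inf')
        else:
            exploitation = child.total_value / child.visits
            exploration = exploration_factor * math.sqrt(
                math.log(node.visits) / child.visits)
            scores[a_i] = exploitation + exploration
    return scores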
def train_episode(env: gym.Env, agent: ActorCriticAgent, memory: ReplayMemory,
                  batch_size: int, gamma: float, tau: float) -> float:
    s = env.reset()
    done = False
    total_reward = 0
    while not done:
        a = agent.perform_policy(s)
        if agent.a_space_type == "discrete":
            discrete_a = np.argmax(a)
            s2, r, done, info = env.step(discrete_a)
        else:
            s2, r, done, info = env.step(a)
        env.render()
        memory.push(s, a, r, done, s2)
        total_reward += r
        if len(memory) > batch_size:
            transition_batch = memory.sample(batch_size)
            agent.learning(transition_batch, gamma, tau)
        s = s2
    return total_reward
def _play(
    self,
    env: gym.Env,
    callbacks: Collection[Callback] = (),
    nb_episodes: int = 1000,
    experiment_name: str = "",
    is_training: bool = False,
):
    context = Context(experiment_name, self, env, callbacks)
    context.on_training_begin()
    for episode in range(nb_episodes):
        state = env.reset()
        done = False
        step = 0
        context.on_episode_begin(episode)
        while not done:
            action = (self.take_action(state) if is_training
                      else self.choose_best_action(state))
            context.on_step_begin(step, action)
            state2, reward, done, info = env.step(action)
            observation = (state, action, reward, state2, done)
            if is_training:
                self.observe(observation)
            context.on_step_end(step, observation)
            state = state2
            step += 1
        context.on_episode_end(episode)
    context.on_training_end()
def test_agent_performance(env: gym.Env, sac_params: SacEntropyAdjustmentParams,
                           run_params: RunParams, writer, test_episode_number: int,
                           scaler: sklearn.preprocessing.StandardScaler):
    """Tests the agent's performance by running the policy for a certain number of
    episodes. The average episode reward and episode length are logged to the console
    and, optionally, to Tensorboard."""
    with torch.no_grad():
        episode_rewards, episode_lengths = [], []
        for j in range(sac_params.num_test_episodes):
            state, done, episode_reward, episode_length = env.reset(), False, 0, 0
            while not done:
                state_scaled = scale_state(scaler, state) if run_params.should_scale_states else state
                action = select_action_sac(state_scaled, sac_params, deterministic=True)  # No noise, pure exploitation
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                episode_length += 1
            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)

        print(f"\tAverage total episode reward: {np.mean(episode_rewards):.3f}"
              f"\tAverage episode length: {np.mean(episode_lengths):.3f}")

        if run_params.use_tensorboard:
            writer.add_scalar("Test Performance/Average Performance",
                              np.mean(episode_rewards), test_episode_number)
            writer.add_scalar("Test Performance/Average Episode Steps",
                              np.mean(episode_lengths), test_episode_number)
def test(env: gym.Env, agent: AgentBase, settings: TestSettings):
    # Initialize variables for logging.
    # agent.load(settings.directory)
    scores = ContiguousRingBuffer(capacity=128)
    eps = ConstantEpsilon(0.01)
    for i_episode in tqdm(range(settings.num_episodes)):
        # Initialize episode
        state = env.reset()
        total_reward = 0

        # Interact with the environment until done.
        done = False
        step = 0
        while not done:
            action = agent.select_action(state, eps(i_episode))
            if settings.render:
                env.render()
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            state = next_state
            time.sleep(1.0 / settings.fps)
            logger.debug('{}:{}'.format(step, action))
            step += 1

        # Save the final score.
        scores.append(total_reward)
    return scores
def _check_returned_values(env: gym.Env, observation_space: spaces.Space,
                           action_space: spaces.Space) -> None:
    """
    Check the returned values by the env when calling `.reset()` or `.step()` methods.
    """
    # because env inherits from gym.Env, we assume that `reset()` and `step()` methods exist
    obs = env.reset()

    if isinstance(observation_space, spaces.Dict):
        assert isinstance(obs, dict), "The observation returned by `reset()` must be a dictionary"
        for key in observation_space.spaces.keys():
            try:
                _check_obs(obs[key], observation_space.spaces[key], "reset")
            except AssertionError as e:
                raise AssertionError(f"Error while checking key={key}: " + str(e))
    else:
        _check_obs(obs, observation_space, "reset")

    # Sample a random action
    action = action_space.sample()
    data = env.step(action)

    assert len(data) == 4, "The `step()` method must return four values: obs, reward, done, info"

    # Unpack
    obs, reward, done, info = data

    if isinstance(observation_space, spaces.Dict):
        assert isinstance(obs, dict), "The observation returned by `step()` must be a dictionary"
        for key in observation_space.spaces.keys():
            try:
                _check_obs(obs[key], observation_space.spaces[key], "step")
            except AssertionError as e:
                raise AssertionError(f"Error while checking key={key}: " + str(e))
    else:
        _check_obs(obs, observation_space, "step")

    # We also allow int because the reward will be cast to float
    assert isinstance(reward, (float, int, np.float32)), "The reward returned by `step()` must be a float"
    assert isinstance(done, bool), "The `done` signal must be a boolean"
    assert isinstance(info, dict), "The `info` returned by `step()` must be a python dictionary"

    if isinstance(env, gym.GoalEnv):
        # For a GoalEnv, the keys are checked at reset
        assert reward == env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)
def _check_returned_values(env: gym.Env, observation_space: spaces.Space,
                           action_space: spaces.Space) -> None:
    """
    Check the returned values by the env when calling `.reset()` or `.step()` methods.
    """
    # because env inherits from gym.Env, we assume that `reset()` and `step()` methods exist
    obs = env.reset()

    _check_obs(obs, observation_space, 'reset')

    # Sample a random action
    action = action_space.sample()
    data = env.step(action)

    assert len(data) == 4, "The `step()` method must return four values: obs, reward, done, info"

    # Unpack
    obs, reward, done, info = data

    _check_obs(obs, observation_space, 'step')

    # We also allow int because the reward will be cast to float
    assert isinstance(reward, (float, int)), "The reward returned by `step()` must be a float"
    assert isinstance(done, bool), "The `done` signal must be a boolean"
    assert isinstance(info, dict), "The `info` returned by `step()` must be a python dictionary"

    if isinstance(env, gym.GoalEnv):
        # For a GoalEnv, the keys are checked at reset
        assert reward == env.compute_reward(obs['achieved_goal'], obs['desired_goal'], info)
def iterate_batches(env: gym.Env, net: Net, batch_size: int):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)
    while True:
        # noinspection PyArgumentList
        obs_v = torch.FloatTensor([obs])
        # Get the probability distribution, sample and execute an action.
        act_probs = sm(net(obs_v)).data.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, done, _ = env.step(action)
        # Collect metrics for applying the cross-entropy method.
        episode_reward += reward
        step = EpisodeStep(observation=obs, action=action)
        episode_steps.append(step)
        if done:
            episode = Episode(reward=episode_reward, steps=episode_steps)
            batch.append(episode)
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs
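# For completeness, a sketch of the two record types used by the batch
# iterators above, consistent with how they are constructed here (fields
# `observation`/`action` and `reward`/`steps`); the exact definitions in the
# original source may differ.
from collections import namedtuple

EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])
Episode = namedtuple('Episode', field_names=['reward', 'steps'])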
def get_trajectory(
    self,
    env: gym.Env,
    actor: nn.Module,
    device,
    sampler_index: int = None,
    trajectory_index: int = None,
    t_max: int = 1000,
) -> Trajectory:
    if sampler_index is not None:
        epsilon = float(pow(0.9996, trajectory_index + 1) / (sampler_index + 1))
    else:
        epsilon = None

    state = env.reset()
    observations, actions, rewards, dones = [], [], [], []

    for t in range(t_max):
        action = self.get_action(env, actor, state=state, epsilon=epsilon)
        next_state, reward, done, _ = env.step(action)

        observations.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)

        state = next_state
        if done:
            break

    trajectory = Trajectory(observations, actions, rewards, dones)
    return trajectory
def fit(self, env: gym.Env, nb_steps: int) -> None:
    """
    Train the agent on the given proxy environment.

    :param env: the gym environment in which the agent is trained
    :param nb_steps: number of training steps to be performed
    :return: None
    """
    action_counter = 0
    # For the BanditEnv example there will only ever be a single episode.
    # In general that's not the case, so for completeness we implement a
    # training loop that supports learning across many episodes.
    episode_counter = 0
    nb_steps_digits = len(str(nb_steps))
    while action_counter < nb_steps:
        env.reset()
        done = False
        episode_counter += 1
        while not done and action_counter < nb_steps:
            action = self._pick_an_action()
            action_counter += 1
            obs, reward, done, info = env.step(action)
            self._update_model(obs, reward, done, info, action)
            if action_counter % 10 == 0:
                print(("Step {:" + str(nb_steps_digits) + "}/{}, episode={}, action={:7}, reward={}").format(
                    action_counter,
                    nb_steps,
                    episode_counter,
                    str(action),
                    reward,
                ))
    env.close()