def on_episode_start(self, *, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy],
                     episode: MultiAgentEpisode, env_index: int, **kwargs):
    print("episode {} (env-idx={}) started.".format(
        episode.episode_id, env_index))
    episode.user_data["pole_angles"] = []
    episode.hist_data["pole_angles"] = []
def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy],
                     episode: MultiAgentEpisode, **kwargs):
    episode.hist_data["actions"] = []
    # Action counters per episode
    for i in range(worker.env.nA):
        episode.user_data["actions/action_" + str(i)] = []
def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                   policies: Dict[str, Policy],
                   episode: MultiAgentEpisode, **kwargs):
    self.player_scores.append(episode.last_info_for('agent_0_high'))
    for i in range(1, 4):
        self.opponent_scores.append(
            episode.last_info_for(f'agent_{i}_high'))
def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                   policies: Dict[PolicyID, Policy],
                   episode: MultiAgentEpisode,
                   env_index: Optional[int] = None, **kwargs) -> None:
    # Record the top-10 allocation sites tracked by tracemalloc as custom metrics.
    snapshot = tracemalloc.take_snapshot()
    top_stats = snapshot.statistics("lineno")
    for stat in top_stats[:10]:
        count = stat.count
        size = stat.size
        trace = str(stat.traceback)
        episode.custom_metrics[f"tracemalloc/{trace}/size"] = size
        episode.custom_metrics[f"tracemalloc/{trace}/count"] = count
    # Record process-level memory usage of this rollout worker.
    process = psutil.Process(os.getpid())
    worker_rss = process.memory_info().rss
    worker_data = process.memory_info().data
    worker_vms = process.memory_info().vms
    episode.custom_metrics["tracemalloc/worker/rss"] = worker_rss
    episode.custom_metrics["tracemalloc/worker/data"] = worker_data
    episode.custom_metrics["tracemalloc/worker/vms"] = worker_vms
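# Note: tracemalloc.take_snapshot() (used in the callback above) raises a
# RuntimeError unless tracing has already been started. A minimal sketch,
# assuming the callback lives in the same DefaultCallbacks subclass, is to
# start tracing lazily when an episode begins:
def on_episode_start(self, *, worker, base_env, policies, episode,
                     env_index=None, **kwargs) -> None:
    import tracemalloc
    # Start tracing once per worker process; is_tracing() guards repeat calls.
    if not tracemalloc.is_tracing():
        tracemalloc.start()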
def on_episode_start(self, *, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy],
                     episode: MultiAgentEpisode, env_index: int, **kwargs):
    episode.user_data["velocity"] = []
    episode.user_data["steering"] = []
    episode.user_data["step_reward"] = []
    episode.user_data["acceleration"] = []
def on_episode_start(self, *, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy],
                     episode: MultiAgentEpisode, env_index: int, **kwargs):
    episode.user_data['op'] = {}
    for op in METRIC_OPS:
        episode.user_data['op'][op] = defaultdict(list)
def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                    episode: MultiAgentEpisode, **kwargs):
    """
    pole_angle = abs(episode.last_observation_for()[2])
    raw_angle = abs(episode.last_raw_obs_for()[2])
    assert pole_angle == raw_angle
    episode.user_data["pole_angles"].append(pole_angle)
    """
    prefix = "agt_"
    for i in range(num_agents):
        obs = episode.last_raw_obs_for(i)
        # obs = episode.last_observation_for(i)
        act = episode.last_action_for(i)
        info = episode.last_info_for(i)
        reward = info.get("reward")
        NAV = info.get("NAV")
        NAV = None if NAV is None else float(NAV)
        num_trades = info.get("num_trades")
        if reward is None:
            # Go to the next agent.
            continue
        episode.user_data[prefix + str(i) + "_obs"].append(obs)
        episode.user_data[prefix + str(i) + "_act"].append(act)
        episode.user_data[prefix + str(i) + "_reward"].append(reward)
        episode.user_data[prefix + str(i) + "_NAV"].append(NAV)
        episode.user_data[prefix + str(i) + "_num_trades"].append(num_trades)
def on_episode_end(
    self,
    worker: RolloutWorker,
    base_env: BaseEnv,
    policies: Dict[str, Policy],
    episode: MultiAgentEpisode,
    **kwargs,
):
    ego_speed = episode.user_data["ego_speed"]
    mean_ego_speed = {
        _id: np.mean(speed_hist) for _id, speed_hist in ego_speed.items()
    }
    distance_travelled = {
        _id: np.mean(info["score"])
        for _id, info in episode._agent_to_last_info.items()
    }
    speed_list = list(map(lambda x: round(x, 3), mean_ego_speed.values()))
    dist_list = list(map(lambda x: round(x, 3), distance_travelled.values()))
    reward_list = list(map(lambda x: round(x, 3), episode.agent_rewards.values()))
    for _id, speed in mean_ego_speed.items():
        episode.custom_metrics[f"mean_ego_speed_{_id}"] = speed
    for _id, distance in distance_travelled.items():
        episode.custom_metrics[f"distance_travelled_{_id}"] = distance
    print(
        f"episode {episode.episode_id} ended with {episode.length} steps: "
        f"[mean_speed]: {speed_list} [distance_travelled]: {dist_list} "
        f"[reward]: {reward_list}"
    )
def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                    episode: MultiAgentEpisode, **kwargs):
    if episode.last_info_for() is not None:
        profit = episode.last_info_for()['profit']
        episode.user_data["profit"].append(profit)
        episode.user_data["actions"][episode.last_action_for()] += 1
def on_episode_end(
    self,
    worker: RolloutWorker,
    base_env: BaseEnv,
    policies: Dict[str, Policy],
    episode: MultiAgentEpisode,
    **kwargs,
):
    ego_speed = episode.user_data["ego_speed"]
    mean_ego_speed = {
        agent_id: np.mean(speed_hist)
        for agent_id, speed_hist in ego_speed.items()
    }
    distance_travelled = dict()
    for _id, info in episode._agent_to_last_info.items():
        if info.get("_group_info"):
            for i, _info in enumerate(info["_group_info"]):
                distance_travelled[f"{_id}:AGENT-{i}"] = np.mean(_info["score"])
        else:
            distance_travelled[_id] = np.mean(info["score"])
    speed_list = list(map(lambda x: round(x, 3), mean_ego_speed.values()))
    dist_list = list(map(lambda x: round(x, 3), distance_travelled.values()))
    reward_list = list(map(lambda x: round(x, 3), episode.agent_rewards.values()))
    episode.custom_metrics["mean_ego_speed"] = sum(speed_list) / max(
        1, len(speed_list)
    )
    episode.custom_metrics["distance_travelled"] = sum(dist_list) / max(
        1, len(dist_list)
    )
    logger.info(f"episode {episode.episode_id} ended with {episode.length} steps")
def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                   policies: Dict[str, Policy],
                   episode: MultiAgentEpisode, **kwargs):
    pole_angle = np.mean(episode.user_data["pole_angles"])
    print("episode {} ended with length {} and pole angles {}".format(
        episode.episode_id, episode.length, pole_angle))
    episode.custom_metrics["pole_angle"] = pole_angle
    episode.hist_data["pole_angles"] = episode.user_data["pole_angles"]
def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy],
                     episode: MultiAgentEpisode, **kwargs):
    # print("episode {} started".format(episode.episode_id))
    episode.user_data["reward_score"] = []
    episode.user_data["reward_target_bias"] = []
    episode.user_data["reward_ap"] = []
    episode.user_data["ep_target_bias"] = []
    episode.user_data["num_no_action"] = 0
def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                   policies: Dict[str, Policy],
                   episode: MultiAgentEpisode, **kwargs):
    episode.custom_metrics['nr_ac_crashes'] = sum(
        value == -1 for value in episode.agent_rewards.values())
    episode.custom_metrics['nr_ac_landed'] = sum(
        value == 1 for value in episode.agent_rewards.values())
    episode.custom_metrics['nr_ac_out_of_bounds'] = sum(
        value == -0.5 for value in episode.agent_rewards.values())
def on_episode_step(self, *, worker: RolloutWorker, base_env: BaseEnv,
                    episode: MultiAgentEpisode, env_index: int, **kwargs):
    # Make sure this episode is ongoing.
    assert episode.length > 0, \
        "ERROR: `on_episode_step()` callback should not be called right " \
        "after env reset!"
    pole_angle = abs(episode.last_observation_for()[2])
    raw_angle = abs(episode.last_raw_obs_for()[2])
    assert pole_angle == raw_angle
    episode.user_data["pole_angles"].append(pole_angle)
def on_episode_step(
        self, episode: MultiAgentEpisode = None, step_data: dict = None):
    if not self._log_current_full_episode:
        return None
    assert episode is not None or step_data is not None
    assert episode is None or step_data is None
    if step_data is None:
        step_data = {}
        for agent_id, policy in episode._policies.items():
            if self._first_fake_step_done:
                if agent_id in self._log_full_epi_tmp_data.keys():
                    obs_before_act = self._log_full_epi_tmp_data[agent_id]
                else:
                    obs_before_act = None
                action = episode.last_action_for(agent_id).tolist()
                epi = episode.episode_id
                rewards = episode._agent_reward_history[agent_id]
                reward = rewards[-1] if len(rewards) > 0 else None
                info = episode.last_info_for(agent_id)
                if hasattr(policy, "to_log"):
                    info.update(policy.to_log)
                else:
                    logger.info(f"policy {policy} doesn't have attrib "
                                "to_log. hasattr(policy, 'to_log'): "
                                f"{hasattr(policy, 'to_log')}")
            # The episode provides the last action together with the last
            # observation *produced by* that action. But we need the
            # observation that caused the agent to play this action,
            # i.e. the observation from step n-1.
            obs_after_act = episode.last_observation_for(agent_id)
            self._log_full_epi_tmp_data[agent_id] = obs_after_act
            if self._first_fake_step_done:
                if self.log_ful_epi_one_hot_obs:
                    obs_before_act = np.argwhere(obs_before_act)
                    obs_after_act = np.argwhere(obs_after_act)
                step_data[agent_id] = {
                    "obs_before_act": obs_before_act,
                    "obs_after_act": obs_after_act,
                    "action": action,
                    "reward": reward,
                    "info": info,
                    "epi": epi}
    if self._first_fake_step_done:
        self.json_logger.write_json(step_data)
        self.json_logger.write("\n")
        self.step_counter += 1
    else:
        logger.info("FullEpisodeLogger: don't log first fake step")
        self._first_fake_step_done = True
def on_episode_start(
    self,
    worker: RolloutWorker,
    base_env: BaseEnv,
    policies: Dict[str, Policy],
    episode: MultiAgentEpisode,
    **kwargs,
):
    print("episode {} started".format(episode.episode_id))
    episode.user_data["ego_speed"] = dict()
    episode.user_data["step_heading_error"] = dict()
def on_episode_start(self, *, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy],
                     episode: MultiAgentEpisode, env_index: int, **kwargs):
    # Make sure this episode has just been started (only initial obs
    # logged so far).
    assert episode.length == 0, \
        "ERROR: `on_episode_start()` callback should be called right " \
        "after env reset!"
    print("episode {} (env-idx={}) started.".format(
        episode.episode_id, env_index))
    episode.user_data["pole_angles"] = []
    episode.hist_data["pole_angles"] = []
def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                   policies: Dict[str, Policy],
                   episode: MultiAgentEpisode, **kwargs):
    # https://docs.ray.io/en/master/rllib-package-ref.html?highlight=MultiAgentEpisode#ray.rllib.evaluation.MultiAgentEpisode
    if worker.env.mode == 'test':
        # Test episode ended, save metrics. This value (or a mean of several
        # of them) is meant to be used as the metric for the checkpoints.
        episode.custom_metrics["test_return"] = episode.agent_rewards[(
            'agent0', 'default_policy')]
    for i in range(worker.env.nA):
        episode.custom_metrics["actions/action_" + str(i)] = sum(
            episode.user_data["actions/action_" + str(i)])
def on_episode_step(self, *, worker, base_env: BaseEnv,
                    episode: MultiAgentEpisode, env_index: int, **kwargs):
    info = self.get_info(base_env, episode)
    # Add all custom scalar metrics from the info dict.
    if info is not None and 'scalar_metrics' in info:
        for metric_name, metric_value in info['scalar_metrics'].items():
            episode.custom_metrics[metric_name] = metric_value
            # Increment (or initialize) the sum over all time steps inside
            # the episode.
            eps_metric_name = f'eps_{metric_name}'
            if eps_metric_name in episode.user_data:
                episode.user_data[eps_metric_name] += metric_value
            else:
                episode.user_data[eps_metric_name] = metric_value
def on_episode_end(self, *, worker: RolloutWorker, base_env: BaseEnv,
                   policies: Dict[str, Policy],
                   episode: MultiAgentEpisode, env_index: int, **kwargs):
    # Make sure this episode is really done.
    assert episode.batch_builder.policy_collectors[
        "default_policy"].buffers["dones"][-1], \
        "ERROR: `on_episode_end()` should only be called " \
        "after episode is done!"
    pole_angle = np.mean(episode.user_data["pole_angles"])
    print("episode {} (env-idx={}) ended with length {} and pole "
          "angles {}".format(episode.episode_id, env_index, episode.length,
                             pole_angle))
    episode.custom_metrics["pole_angle"] = pole_angle
    episode.hist_data["pole_angles"] = episode.user_data["pole_angles"]
def on_episode_end(self, *, worker: RolloutWorker, base_env: BaseEnv,
                   policies: Dict[str, Policy],
                   episode: MultiAgentEpisode, env_index: int, **kwargs):
    # Make sure this episode is really done.
    assert episode.batch_builder.policy_collectors[
        "default_policy"].buffers["dones"][-1], \
        "ERROR: `on_episode_end()` should only be called " \
        "after episode is done!"
    print(
        "episode {} (env-idx={}) ended with length {}, agent rewards {} "
        "and total reward {}".format(episode.episode_id, env_index,
                                     episode.length, episode.agent_rewards,
                                     episode.total_reward))
    episode.custom_metrics["agent_rewards"] = episode.agent_rewards
    episode.custom_metrics["total_reward"] = episode.total_reward
def on_episode_end(  # type: ignore
        self, *_, episode: MultiAgentEpisode, **__) -> None:
    key = list(episode._agent_to_last_info.keys())[0]
    ep_info = episode.last_info_for(key).copy()
    episode.custom_metrics.update(ray.tune.utils.flatten_dict(ep_info))
def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy],
                     episode: MultiAgentEpisode, **kwargs):
    episode.custom_metrics = {
        'episode_average_departure': [],
        'episode_average_arrival': [],
        'episode_average_wait': [],
        'episode_missing_agents': [],
        'episode_on_time_agents': [],
        'episode_total_reward': [],
    }
    episode.hist_data = {
        'info_by_agent': [],
        'rewards_by_agent': [],
        'last_action_by_agent': [],
    }
def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                    episode: MultiAgentEpisode, **kwargs):
    for agent_name in range(4):
        action = episode.last_action_for(agent_name)
        if action == constants.Action.Bomb.value:
            episode.custom_metrics["agent_{}/num_bombs".format(
                agent_name)] += 1
def on_postprocess_trajectory(self, *, worker: "RolloutWorker",
                              episode: MultiAgentEpisode, agent_id: AgentID,
                              policy_id: PolicyID,
                              policies: Dict[PolicyID, Policy],
                              postprocessed_batch: SampleBatch,
                              original_batches: Dict[AgentID, SampleBatch],
                              **kwargs) -> None:
    if args.store_network_data:
        network_data_list = []
        for i in range(len(postprocessed_batch['obs'])):
            network_data_step = {}
            for key in postprocessed_batch:
                network_key = None
                if key == 'actions':
                    network_key = 'action'
                elif key == 'obs':
                    network_key = 'observation'
                elif key in [
                        'action_prob', 'action_logp', 'action_dist_inputs',
                        'vf_preds', 'fc_1', 'fc_2', 'fc_value_1',
                        'fc_value_2', 'logits'
                ]:
                    network_key = key
                if network_key is not None:
                    network_data_step[
                        network_key] = postprocessed_batch[key][i]
            network_data_list.append(network_data_step)
        episode.user_data['network_data_list'] = network_data_list
def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                   policies: Dict[str, Policy],
                   episode: MultiAgentEpisode, **kwargs):
    ensemble_rewards = episode.last_info_for()["ensemble_rewards"]
    episode.custom_metrics["max_reward"].append(
        np.max(ensemble_rewards))
    for i, ri in enumerate(ensemble_rewards):
        episode.custom_metrics[f"reward_{i}"].append(ri)
def on_postprocess_trajectory(
        self, worker: RolloutWorker, episode: MultiAgentEpisode,
        agent_id: str, policy_id: str, policies: Dict[str, Policy],
        postprocessed_batch: SampleBatch,
        original_batches: Dict[str, SampleBatch], **kwargs):
    if "num_batches" not in episode.custom_metrics:
        episode.custom_metrics["num_batches"] = 0
    episode.custom_metrics["num_batches"] += 1
def on_episode_step(self, *, worker: RolloutWorker, base_env: BaseEnv,
                    episode: MultiAgentEpisode, env_index: int, **kwargs):
    episode_info = episode.last_info_for()
    if episode_info:
        for op in list(episode_info.keys() & METRIC_OPS):
            for k, v in episode_info[op].items():
                episode.user_data['op'][op][k].append(v)
def on_episode_step(self, *, worker: RolloutWorker, base_env: BaseEnv,
                    episode: MultiAgentEpisode, env_index: int, **kwargs):
    info = episode.last_info_for()
    if info is not None:
        episode.user_data["velocity"].append(info["velocity"])
        episode.user_data["steering"].append(info["steering"])
        episode.user_data["step_reward"].append(info["step_reward"])
        episode.user_data["acceleration"].append(info["acceleration"])
def on_episode_step(self, *, worker: RolloutWorker, base_env: BaseEnv,
                    episode: MultiAgentEpisode, env_index: int, **kwargs):
    if isinstance(episode.last_info_for(), dict):
        info = episode.last_info_for()
        # Extract the timestep of the current step from the info dict.
        timestep = info['timestep']
        episode.user_data["timestep"].append(timestep)
        # Extract num_consolidated from the info dict.
        num_consolidated = info['num_consolidated']
        episode.user_data["num_consolidated"].append(num_consolidated)
        # Extract num_overloaded from the info dict.
        num_overloaded = info['num_overloaded']
        episode.user_data["num_overloaded"].append(num_overloaded)
        # Extract num_moves from the info dict.
        num_moves = info['num_moves']
        episode.user_data["num_moves"].append(num_moves)
        # Extract greedy_num_consolidated from the info dict.
        greedy_num_consolidated = info['greedy_num_consolidated']
        episode.user_data["greedy_num_consolidated"].append(
            greedy_num_consolidated)
        # Per-step rewards.
        rewards = info['rewards']
        episode.user_data["rewards"].append(rewards)
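# The methods above are hooks of RLlib's DefaultCallbacks. A minimal sketch of
# how such a callbacks class is wired into a trainer, assuming the Ray 1.x API
# these snippets use; "MyCallbacks" is a hypothetical name and the PPO /
# CartPole-v1 choices are placeholders:
from ray import tune
from ray.rllib.agents.callbacks import DefaultCallbacks

class MyCallbacks(DefaultCallbacks):
    # Override any of the on_episode_* / on_postprocess_trajectory hooks
    # shown above in this class.
    pass

tune.run(
    "PPO",
    config={
        "env": "CartPole-v1",
        # Pass the callbacks class itself (not an instance) in the config.
        "callbacks": MyCallbacks,
    },
)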