def perform_dqn_logging(self):
    """Log DQN training progress to stdout and the scalar logger.

    Reads per-episode rewards from the env's ``Monitor`` wrapper, updates the
    running mean (last 100 episodes) and best-mean trackers on ``self``, then
    prints the stats and records each one via ``self.logger.log_scalar``.
    """
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
    episode_rewards_len = len(episode_rewards)
    if episode_rewards_len > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])
    if episode_rewards_len > 100:
        # Only start tracking a "best" once a full 100-episode window exists.
        self.best_mean_episode_reward = max(
            self.best_mean_episode_reward, self.mean_episode_reward)

    logs = OrderedDict()
    logs["Train_EnvstepsSoFar"] = self.agent.t
    print("Timestep %d" % (self.agent.t,))

    # -5000 acts as a "not yet initialized" sentinel for the reward trackers
    # (presumably set to a very negative value in __init__ -- TODO confirm).
    if self.mean_episode_reward > -5000:
        # Fixed: the tracker is already a scalar, so the original
        # np.mean(self.mean_episode_reward) wrapper was a no-op.
        logs["Train_AverageReturn"] = self.mean_episode_reward
        print("mean reward (100 episodes) %f" % self.mean_episode_reward)
    if self.best_mean_episode_reward > -5000:
        logs["Train_BestReturn"] = self.best_mean_episode_reward
        print("best mean reward %f" % self.best_mean_episode_reward)
    print(f'episode len: {episode_rewards_len}')

    if self.start_time is not None:
        time_since_start = (time.time() - self.start_time)
        print("running time %f" % time_since_start)
        logs["TimeSinceStart"] = time_since_start

    sys.stdout.flush()
    for key, value in logs.items():
        print('{} : {}'.format(key, value))
        self.logger.log_scalar(value, key, self.agent.t)
    print('Done logging...\n\n')
    self.logger.flush()
def perform_dqn_logging(self):
    """Report DQN training stats (rolling/best mean reward, env steps, runtime).

    Updates ``self.mean_episode_reward`` and ``self.best_mean_episode_reward``
    from the Monitor wrapper's episode rewards, prints the stats, and pushes
    them to wandb when ``self.params['use_wandb'] == 1``.
    """
    monitor = get_wrapper_by_name(self.env, "Monitor")
    ep_returns = monitor.get_episode_rewards()
    n_episodes = len(ep_returns)

    if n_episodes > 0:
        self.mean_episode_reward = np.mean(ep_returns[-100:])
    if n_episodes > 100:
        # Best-so-far is only meaningful once a full 100-episode window exists.
        self.best_mean_episode_reward = max(
            self.best_mean_episode_reward, self.mean_episode_reward)

    stats = OrderedDict()
    stats["Train_EnvstepsSoFar"] = self.agent.t
    print("Timestep %d" % (self.agent.t,))

    # Trackers below -5000 are treated as uninitialized and skipped.
    if self.mean_episode_reward > -5000:
        stats["Train_AverageReturn"] = np.mean(self.mean_episode_reward)
        print("mean reward (100 episodes) %f" % self.mean_episode_reward)
    if self.best_mean_episode_reward > -5000:
        stats["Train_BestReturn"] = np.mean(self.best_mean_episode_reward)
        print("best mean reward %f" % self.best_mean_episode_reward)

    if self.start_time is not None:
        elapsed = time.time() - self.start_time
        print("running time %f" % elapsed)
        stats["TimeSinceStart"] = elapsed

    if self.params['use_wandb'] == 1:
        wandb.log(stats)
def perform_dqn_logging(self, all_logs):
    """Log DQN training and evaluation statistics.

    Updates the rolling-mean and best-mean reward trackers from the Monitor
    wrapper, merges in the most recent training log from ``all_logs``, samples
    evaluation trajectories with the agent's eval policy, and records all
    scalars via ``self.logger``.

    Args:
        all_logs: sequence of per-iteration training log dicts; only the last
            entry is merged into this iteration's output.
    """
    last_log = all_logs[-1]
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
    # Fixed: removed leftover debug prints ("0: ", "1: ", "2: ") that were
    # marked "# added" and polluted the logging output.
    if len(episode_rewards) > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])
    if len(episode_rewards) > 100:
        # Only track a "best" once a full 100-episode window exists.
        self.best_mean_episode_reward = max(
            self.best_mean_episode_reward, self.mean_episode_reward)

    logs = OrderedDict()
    logs["Train_EnvstepsSoFar"] = self.agent.t
    print("Timestep %d" % (self.agent.t, ))

    # -5000 acts as an "uninitialized tracker" sentinel.
    if self.mean_episode_reward > -5000:
        logs["Train_AverageReturn"] = np.mean(self.mean_episode_reward)
        print("mean reward (100 episodes) %f" % self.mean_episode_reward)
    if self.best_mean_episode_reward > -5000:
        logs["Train_BestReturn"] = np.mean(self.best_mean_episode_reward)
        print("best mean reward %f" % self.best_mean_episode_reward)

    if self.start_time is not None:
        time_since_start = (time.time() - self.start_time)
        print("running time %f" % time_since_start)
        logs["TimeSinceStart"] = time_since_start

    logs.update(last_log)

    # Evaluate the current policy on the eval env.
    eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
        self.eval_env, self.agent.eval_policy,
        self.params['eval_batch_size'], self.params['ep_len'])
    eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]
    eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]
    logs["Eval_AverageReturn"] = np.mean(eval_returns)
    logs["Eval_StdReturn"] = np.std(eval_returns)
    logs["Eval_MaxReturn"] = np.max(eval_returns)
    logs["Eval_MinReturn"] = np.min(eval_returns)
    logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)
    logs['Buffer size'] = self.agent.replay_buffer.num_in_buffer

    sys.stdout.flush()
    for key, value in logs.items():
        print('{} : {}'.format(key, value))
        self.logger.log_scalar(value, key, self.agent.t)
    print('Done logging...\n\n')
    self.logger.flush()
def perform_dqn_logging(self, all_logs):
    """Print and record DQN training diagnostics for the current iteration.

    Pulls episode rewards from the env's Monitor wrapper to refresh the
    rolling-mean and best-mean trackers, merges in the latest entry of
    ``all_logs``, and writes every scalar through ``self.logger``.

    Args:
        all_logs: sequence of per-iteration log dicts; only the final one
            is folded into this iteration's diagnostics.
    """
    latest = all_logs[-1]
    returns = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()

    if returns:
        self.mean_episode_reward = np.mean(returns[-100:])
    if len(returns) > 100:
        # A "best" value is only tracked after a full 100-episode window.
        self.best_mean_episode_reward = max(
            self.best_mean_episode_reward, self.mean_episode_reward)

    diagnostics = OrderedDict()
    diagnostics["Train_EnvstepsSoFar"] = self.agent.t
    print("Timestep %d" % (self.agent.t, ))

    # Values below -5000 mean the tracker hasn't been populated yet.
    if self.mean_episode_reward > -5000:
        diagnostics["Train_AverageReturn"] = np.mean(self.mean_episode_reward)
        print("mean reward (100 episodes) %f" % self.mean_episode_reward)
    if self.best_mean_episode_reward > -5000:
        diagnostics["Train_BestReturn"] = np.mean(self.best_mean_episode_reward)
        print("best mean reward %f" % self.best_mean_episode_reward)

    if self.start_time is not None:
        elapsed = time.time() - self.start_time
        print("running time %f" % elapsed)
        diagnostics["TimeSinceStart"] = elapsed

    diagnostics.update(latest)

    sys.stdout.flush()
    for name, val in diagnostics.items():
        print("{} : {}".format(name, val))
        self.logger.log_scalar(val, name, self.agent.t)
    print("Done logging...\n\n")
    self.logger.flush()