    def update_log(self, phase=RunPhase.TRAIN):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        logger.set_current_time(self.current_episode)
        logger.create_signal_value('Training Iter', self.training_iteration)
        logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP))
        logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
        logger.create_signal_value('ER #Episodes', self.memory.length())
        logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
        logger.create_signal_value('Total steps', self.total_steps_counter)
        logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
        logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
                                   if phase == RunPhase.TRAIN else np.nan)
        logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
                                   if phase == RunPhase.TEST else np.nan)
        logger.create_signal_value('Update Target Network', 0, overwrite=False)
        logger.update_wall_clock_time(self.current_episode)

        for signal in self.signals:
            logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
            logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
            logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
            logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())

        # dump the accumulated signals to a CSV file every few episodes
        if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \
                and self.current_episode > 0:
            logger.dump_output_csv()
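    # A minimal sketch of how the Signal objects consumed by update_log() are
    # assumed to behave (illustrative only: the accessor names mirror the calls
    # above, while `add_sample` and the sample values are assumptions):
    #
    #   loss_signal = Signal('Loss')
    #   loss_signal.add_sample(0.42)
    #   loss_signal.add_sample(0.31)
    #   loss_signal.get_mean()   # mean of the samples accumulated so far
    #   loss_signal.get_stdev()  # matching Stdev/Max/Min accessors used above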
    def __init__(self, env, tuning_parameters, replicated_device=None, task_id=0):
        """
        :param env: An environment instance
        :type env: EnvironmentWrapper
        :param tuning_parameters: A Preset class instance with all the running parameters
        :type tuning_parameters: Preset
        :param replicated_device: A tensorflow device for distributed training (optional)
        :type replicated_device: instancemethod
        :param task_id: The current task id
        :type task_id: int
        """
        screen.log_title("Creating agent {}".format(task_id))
        self.task_id = task_id
        self.sess = tuning_parameters.sess
        self.env = tuning_parameters.env_instance = env
        self.imitation = False

        # i/o dimensions
        if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
            tuning_parameters.env.desired_observation_width = self.env.width
            tuning_parameters.env.desired_observation_height = self.env.height
        self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size
        self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size
        if tuning_parameters.agent.use_accumulated_reward_as_measurement:
            self.measurements_size = tuning_parameters.env.measurements_size = (self.measurements_size[0] + 1,)

        # modules
        if tuning_parameters.agent.load_memory_from_file_path:
            screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
                             .format(tuning_parameters.agent.load_memory_from_file_path))
            self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path)
        else:
            self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
        # self.architecture = eval(tuning_parameters.architecture)

        self.has_global = replicated_device is not None
        self.replicated_device = replicated_device
        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) \
            if replicated_device is not None else "/gpu:0"

        self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)')
        self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy
                                                  + '(tuning_parameters)')
        self.evaluation_exploration_policy.change_phase(RunPhase.TEST)

        # initialize all internal variables
        self.tp = tuning_parameters
        self.in_heatup = False
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.current_episode = self.tp.current_episode = 0
        self.curr_state = {}
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        logger.set_current_time(self.current_episode)
        self.main_network = None
        self.networks = []
        self.last_episode_images = []
        self.renderer = Renderer()

        # signals
        self.signals = []
        self.loss = Signal('Loss')
        self.signals.append(self.loss)
        self.curr_learning_rate = Signal('Learning Rate')
        self.signals.append(self.curr_learning_rate)

        if self.tp.env.normalize_observation and not self.env.is_state_type_image:
            if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,))
                self.running_reward_stats = RunningStat(())
            else:
                self.running_observation_stats = SharedRunningStats(self.tp, replicated_device,
                                                                    shape=(self.tp.env.desired_observation_width,),
                                                                    name='observation_stats')
                self.running_reward_stats = SharedRunningStats(self.tp, replicated_device,
                                                               shape=(),
                                                               name='reward_stats')
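        # Illustrative sketch of how the running statistics above are typically
        # used to normalize observations (not part of this class; the
        # `push`/`mean`/`std` accessors are assumptions about the RunningStat API):
        #
        #   self.running_observation_stats.push(observation)
        #   normalized = (observation - self.running_observation_stats.mean) / \
        #                (self.running_observation_stats.std + 1e-8)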
        # The env was already reset at this point. Resetting it again here would
        # raise an error, since an env which is not done cannot be reset.
        self.reset_game(do_not_reset_env=True)

        # use seed
        if self.tp.seed is not None:
            random.seed(self.tp.seed)
            np.random.seed(self.tp.seed)
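    # Illustrative usage sketch (assumptions: this class is named Agent, `env` is
    # an EnvironmentWrapper instance, and `tp` is a Preset with tp.sess already
    # assigned; names mirror the constructor parameters above):
    #
    #   agent = Agent(env, tp, replicated_device=None, task_id=0)
    #   ...run heatup / training steps...
    #   agent.update_log(phase=RunPhase.TRAIN)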