Example #1
    def update_log(self, phase=RunPhase.TRAIN):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        logger.set_current_time(self.current_episode)
        logger.create_signal_value('Training Iter', self.training_iteration)
        logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP))
        logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
        logger.create_signal_value('ER #Episodes', self.memory.length())
        logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
        logger.create_signal_value('Total steps', self.total_steps_counter)
        logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
        logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
                                   if phase == RunPhase.TRAIN else np.nan)
        logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
                                   if phase == RunPhase.TEST else np.nan)
        logger.create_signal_value('Update Target Network', 0, overwrite=False)
        logger.update_wall_clock_time(self.current_episode)

        for signal in self.signals:
            logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
            logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
            logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
            logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())

        # periodically dump all the collected signals to a CSV file
        if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \
                and self.current_episode > 0:
            logger.dump_output_csv()
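
A minimal driver sketch of how this method might be invoked; `agent`, `run_episode`, `num_episodes` and `evaluate_every` are illustrative placeholders, not names taken from the framework:

    for episode_idx in range(num_episodes):
        agent.run_episode()  # hypothetical rollout helper
        # log the evaluation reward on evaluation episodes, the training reward otherwise
        phase = RunPhase.TEST if episode_idx % evaluate_every == 0 else RunPhase.TRAIN
        agent.update_log(phase=phase)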
Example #2
    def __init__(self,
                 env,
                 tuning_parameters,
                 replicated_device=None,
                 task_id=0):
        """
        :param env: An environment instance
        :type env: EnvironmentWrapper
        :param tuning_parameters: A Preset class instance with all the running parameters
        :type tuning_parameters: Preset
        :param replicated_device: A tensorflow device for distributed training (optional)
        :type replicated_device: instancemethod
        :param task_id: The current task id
        :type task_id: int
        """

        screen.log_title("Creating agent {}".format(task_id))
        self.task_id = task_id
        self.sess = tuning_parameters.sess
        self.env = tuning_parameters.env_instance = env
        self.imitation = False

        # i/o dimensions
        if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
            tuning_parameters.env.desired_observation_width = self.env.width
            tuning_parameters.env.desired_observation_height = self.env.height
        self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size
        self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size
        if tuning_parameters.agent.use_accumulated_reward_as_measurement:
            self.measurements_size = tuning_parameters.env.measurements_size = (
                self.measurements_size[0] + 1, )

        # modules
        if tuning_parameters.agent.load_memory_from_file_path:
            screen.log_title(
                "Loading replay buffer from pickle. Pickle path: {}".format(
                    tuning_parameters.agent.load_memory_from_file_path))
            self.memory = read_pickle(
                tuning_parameters.agent.load_memory_from_file_path)
        else:
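            # the memory class is instantiated by name: the preset's string is
            # eval()'d, so the named class must already be imported in this module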
            self.memory = eval(tuning_parameters.memory +
                               '(tuning_parameters)')
        # self.architecture = eval(tuning_parameters.architecture)

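        # in distributed training each worker is pinned to its own CPU device;
        # otherwise the agent defaults to the local GPU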
        self.has_global = replicated_device is not None
        self.replicated_device = replicated_device
        self.worker_device = "/job:worker/task:{}/cpu:0".format(
            task_id) if replicated_device is not None else "/gpu:0"

        self.exploration_policy = eval(tuning_parameters.exploration.policy +
                                       '(tuning_parameters)')
        self.evaluation_exploration_policy = eval(
            tuning_parameters.exploration.evaluation_policy +
            '(tuning_parameters)')
        self.evaluation_exploration_policy.change_phase(RunPhase.TEST)

        # initialize all internal variables
        self.tp = tuning_parameters
        self.in_heatup = False
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.current_episode = self.tp.current_episode = 0
        self.curr_state = {}
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        logger.set_current_time(self.current_episode)
        self.main_network = None
        self.networks = []
        self.last_episode_images = []
        self.renderer = Renderer()

        # signals
        self.signals = []
        self.loss = Signal('Loss')
        self.signals.append(self.loss)
        self.curr_learning_rate = Signal('Learning Rate')
        self.signals.append(self.curr_learning_rate)

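        # running mean/std statistics for normalizing non-image observations;
        # distributed workers can optionally share them via the replicated device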
        if self.tp.env.normalize_observation and not self.env.is_state_type_image:
            if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                self.running_observation_stats = RunningStat(
                    (self.tp.env.desired_observation_width, ))
                self.running_reward_stats = RunningStat(())
            else:
                self.running_observation_stats = SharedRunningStats(
                    self.tp,
                    replicated_device,
                    shape=(self.tp.env.desired_observation_width, ),
                    name='observation_stats')
                self.running_reward_stats = SharedRunningStats(
                    self.tp, replicated_device, shape=(), name='reward_stats')

        # the env was already reset at this point, so skip resetting it here;
        # resetting an env which is not done would raise an error
        self.reset_game(do_not_reset_env=True)

        # seed the random number generators for reproducibility
        if self.tp.seed is not None:
            random.seed(self.tp.seed)
            np.random.seed(self.tp.seed)
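
A hedged construction sketch: a concrete agent subclass would be built from an environment wrapper and a preset's tuning parameters. `SomeAgent`, `env` and `preset` below are placeholders, not verified API names:

    # hypothetical single-process setup: replicated_device=None makes the
    # constructor above select the local "/gpu:0" worker device
    agent = SomeAgent(env=env,
                      tuning_parameters=preset,
                      replicated_device=None,
                      task_id=0)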