Python EpisodeLogger示例

编程语言: Python

命名空间/包名称: rl_coach.logger

类/类型: EpisodeLogger

hotexamples.com的示例: 2

Python EpisodeLogger - 已找到2个示例。这些是从开源项目中提取的最受好评的rl_coach.logger.EpisodeLogger现实Python示例。您可以评价示例，以帮助我们提高示例质量。

常用方法

显示隐藏

EpisodeLogger(1)

create_signal_value(1)

dump_output_csv(1)

set_current_time(1)

set_episode_idx(1)

set_logger_filenames(1)

update_wall_clock_time(1)

示例#1

显示文件

文件： agent.py 项目： mdavala/coach

    def __init__(self,
                 agent_parameters: AgentParameters,
                 parent: Union['LevelManager', 'CompositeAgent'] = None):
        """
        :param agent_parameters: A Preset class instance with all the running paramaters
        """
        super().__init__()
        self.ap = agent_parameters
        self.task_id = self.ap.task_parameters.task_index
        self.is_chief = self.task_id == 0
        self.shared_memory = type(agent_parameters.task_parameters) == DistributedTaskParameters \
                             and self.ap.memory.shared_memory
        if self.shared_memory:
            self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
        self.name = agent_parameters.name
        self.parent = parent
        self.parent_level_manager = None
        self.full_name_id = agent_parameters.full_name_id = self.name

        if type(agent_parameters.task_parameters) == DistributedTaskParameters:
            screen.log_title(
                "Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
                "tensorflow wake up time)".format(self.full_name_id,
                                                  self.task_id))
        else:
            screen.log_title("Creating agent - name: {}".format(
                self.full_name_id))
        self.imitation = False
        self.agent_logger = Logger()
        self.agent_episode_logger = EpisodeLogger()

        # get the memory
        # - distributed training + shared memory:
        #   * is chief?  -> create the memory and add it to the scratchpad
        #   * not chief? -> wait for the chief to create the memory and then fetch it
        # - non distributed training / not shared memory:
        #   * create memory
        memory_name = self.ap.memory.path.split(':')[1]
        self.memory_lookup_name = self.full_name_id + '.' + memory_name
        if self.shared_memory and not self.is_chief:
            self.memory = self.shared_memory_scratchpad.get(
                self.memory_lookup_name)
        else:
            # modules
            if agent_parameters.memory.load_memory_from_file_path:
                screen.log_title(
                    "Loading replay buffer from pickle. Pickle path: {}".
                    format(agent_parameters.memory.load_memory_from_file_path))
                self.memory = read_pickle(
                    agent_parameters.memory.load_memory_from_file_path)
            else:
                self.memory = dynamic_import_and_instantiate_module_from_params(
                    self.ap.memory)

            if self.shared_memory and self.is_chief:
                self.shared_memory_scratchpad.add(self.memory_lookup_name,
                                                  self.memory)

        # set devices
        if type(agent_parameters.task_parameters) == DistributedTaskParameters:
            self.has_global = True
            self.replicated_device = agent_parameters.task_parameters.device
            self.worker_device = "/job:worker/task:{}".format(self.task_id)
        else:
            self.has_global = False
            self.replicated_device = None
            self.worker_device = ""
        if agent_parameters.task_parameters.use_cpu:
            self.worker_device += "/cpu:0"
        else:
            self.worker_device += "/device:GPU:0"

        # filters
        self.input_filter = self.ap.input_filter
        self.output_filter = self.ap.output_filter
        self.pre_network_filter = self.ap.pre_network_filter
        device = self.replicated_device if self.replicated_device else self.worker_device
        self.input_filter.set_device(device)
        self.output_filter.set_device(device)
        self.pre_network_filter.set_device(device)

        # initialize all internal variables
        self._phase = RunPhase.HEATUP
        self.total_shaped_reward_in_current_episode = 0
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.last_target_network_update_step = 0
        self.last_training_phase_step = 0
        self.current_episode = self.ap.current_episode = 0
        self.curr_state = {}
        self.current_hrl_goal = None
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        self.agent_logger.set_current_time(self.current_episode)
        self.exploration_policy = None
        self.networks = {}
        self.last_action_info = None
        self.running_observation_stats = None
        self.running_reward_stats = None
        self.accumulated_rewards_across_evaluation_episodes = 0
        self.accumulated_shaped_rewards_across_evaluation_episodes = 0
        self.num_successes_across_evaluation_episodes = 0
        self.num_evaluation_episodes_completed = 0
        self.current_episode_buffer = Episode(
            discount=self.ap.algorithm.discount)
        # TODO: add agents observation rendering for debugging purposes (not the same as the environment rendering)

        # environment parameters
        self.spaces = None
        self.in_action_space = self.ap.algorithm.in_action_space

        # signals
        self.episode_signals = []
        self.step_signals = []
        self.loss = self.register_signal('Loss')
        self.curr_learning_rate = self.register_signal('Learning Rate')
        self.unclipped_grads = self.register_signal('Grads (unclipped)')
        self.reward = self.register_signal('Reward',
                                           dump_one_value_per_episode=False,
                                           dump_one_value_per_step=True)
        self.shaped_reward = self.register_signal(
            'Shaped Reward',
            dump_one_value_per_episode=False,
            dump_one_value_per_step=True)
        if isinstance(self.in_action_space, GoalsSpace):
            self.distance_from_goal = self.register_signal(
                'Distance From Goal', dump_one_value_per_step=True)

        # use seed
        if self.ap.task_parameters.seed is not None:
            random.seed(self.ap.task_parameters.seed)
            np.random.seed(self.ap.task_parameters.seed)

示例#2

显示文件

文件： agent.py 项目： mdavala/coach

class Agent(AgentInterface):
    def __init__(self,
                 agent_parameters: AgentParameters,
                 parent: Union['LevelManager', 'CompositeAgent'] = None):
        """
        :param agent_parameters: A Preset class instance with all the running paramaters
        """
        super().__init__()
        self.ap = agent_parameters
        self.task_id = self.ap.task_parameters.task_index
        self.is_chief = self.task_id == 0
        self.shared_memory = type(agent_parameters.task_parameters) == DistributedTaskParameters \
                             and self.ap.memory.shared_memory
        if self.shared_memory:
            self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
        self.name = agent_parameters.name
        self.parent = parent
        self.parent_level_manager = None
        self.full_name_id = agent_parameters.full_name_id = self.name

        if type(agent_parameters.task_parameters) == DistributedTaskParameters:
            screen.log_title(
                "Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
                "tensorflow wake up time)".format(self.full_name_id,
                                                  self.task_id))
        else:
            screen.log_title("Creating agent - name: {}".format(
                self.full_name_id))
        self.imitation = False
        self.agent_logger = Logger()
        self.agent_episode_logger = EpisodeLogger()

        # get the memory
        # - distributed training + shared memory:
        #   * is chief?  -> create the memory and add it to the scratchpad
        #   * not chief? -> wait for the chief to create the memory and then fetch it
        # - non distributed training / not shared memory:
        #   * create memory
        memory_name = self.ap.memory.path.split(':')[1]
        self.memory_lookup_name = self.full_name_id + '.' + memory_name
        if self.shared_memory and not self.is_chief:
            self.memory = self.shared_memory_scratchpad.get(
                self.memory_lookup_name)
        else:
            # modules
            if agent_parameters.memory.load_memory_from_file_path:
                screen.log_title(
                    "Loading replay buffer from pickle. Pickle path: {}".
                    format(agent_parameters.memory.load_memory_from_file_path))
                self.memory = read_pickle(
                    agent_parameters.memory.load_memory_from_file_path)
            else:
                self.memory = dynamic_import_and_instantiate_module_from_params(
                    self.ap.memory)

            if self.shared_memory and self.is_chief:
                self.shared_memory_scratchpad.add(self.memory_lookup_name,
                                                  self.memory)

        # set devices
        if type(agent_parameters.task_parameters) == DistributedTaskParameters:
            self.has_global = True
            self.replicated_device = agent_parameters.task_parameters.device
            self.worker_device = "/job:worker/task:{}".format(self.task_id)
        else:
            self.has_global = False
            self.replicated_device = None
            self.worker_device = ""
        if agent_parameters.task_parameters.use_cpu:
            self.worker_device += "/cpu:0"
        else:
            self.worker_device += "/device:GPU:0"

        # filters
        self.input_filter = self.ap.input_filter
        self.output_filter = self.ap.output_filter
        self.pre_network_filter = self.ap.pre_network_filter
        device = self.replicated_device if self.replicated_device else self.worker_device
        self.input_filter.set_device(device)
        self.output_filter.set_device(device)
        self.pre_network_filter.set_device(device)

        # initialize all internal variables
        self._phase = RunPhase.HEATUP
        self.total_shaped_reward_in_current_episode = 0
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.last_target_network_update_step = 0
        self.last_training_phase_step = 0
        self.current_episode = self.ap.current_episode = 0
        self.curr_state = {}
        self.current_hrl_goal = None
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        self.agent_logger.set_current_time(self.current_episode)
        self.exploration_policy = None
        self.networks = {}
        self.last_action_info = None
        self.running_observation_stats = None
        self.running_reward_stats = None
        self.accumulated_rewards_across_evaluation_episodes = 0
        self.accumulated_shaped_rewards_across_evaluation_episodes = 0
        self.num_successes_across_evaluation_episodes = 0
        self.num_evaluation_episodes_completed = 0
        self.current_episode_buffer = Episode(
            discount=self.ap.algorithm.discount)
        # TODO: add agents observation rendering for debugging purposes (not the same as the environment rendering)

        # environment parameters
        self.spaces = None
        self.in_action_space = self.ap.algorithm.in_action_space

        # signals
        self.episode_signals = []
        self.step_signals = []
        self.loss = self.register_signal('Loss')
        self.curr_learning_rate = self.register_signal('Learning Rate')
        self.unclipped_grads = self.register_signal('Grads (unclipped)')
        self.reward = self.register_signal('Reward',
                                           dump_one_value_per_episode=False,
                                           dump_one_value_per_step=True)
        self.shaped_reward = self.register_signal(
            'Shaped Reward',
            dump_one_value_per_episode=False,
            dump_one_value_per_step=True)
        if isinstance(self.in_action_space, GoalsSpace):
            self.distance_from_goal = self.register_signal(
                'Distance From Goal', dump_one_value_per_step=True)

        # use seed
        if self.ap.task_parameters.seed is not None:
            random.seed(self.ap.task_parameters.seed)
            np.random.seed(self.ap.task_parameters.seed)

    @property
    def parent(self):
        """
        Get the parent class of the agent
        :return: the current phase
        """
        return self._parent

    @parent.setter
    def parent(self, val):
        """
        Change the parent class of the agent.
        Additionally, updates the full name of the agent
        :param val: the new parent
        :return: None
        """
        self._parent = val
        if self._parent is not None:
            if not hasattr(self._parent, 'name'):
                raise ValueError("The parent of an agent must have a name")
            self.full_name_id = self.ap.full_name_id = "{}/{}".format(
                self._parent.name, self.name)

    def setup_logger(self):
        # dump documentation
        logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
            format(graph_name=self.parent_level_manager.parent_graph_manager.name,
                   level_name=self.parent_level_manager.name,
                   agent_full_id='.'.join(self.full_name_id.split('/')))
        self.agent_logger.set_logger_filenames(
            self.ap.task_parameters.experiment_path,
            logger_prefix=logger_prefix,
            add_timestamp=True,
            task_id=self.task_id)
        if self.ap.visualization.dump_in_episode_signals:
            self.agent_episode_logger.set_logger_filenames(
                self.ap.task_parameters.experiment_path,
                logger_prefix=logger_prefix,
                add_timestamp=True,
                task_id=self.task_id)

    def set_session(self, sess) -> None:
        """
        Set the deep learning framework session for all the agents in the composite agent
        :return: None
        """
        self.input_filter.set_session(sess)
        self.output_filter.set_session(sess)
        self.pre_network_filter.set_session(sess)
        [network.set_session(sess) for network in self.networks.values()]

    def register_signal(self,
                        signal_name: str,
                        dump_one_value_per_episode: bool = True,
                        dump_one_value_per_step: bool = False) -> Signal:
        """
        Register a signal such that its statistics will be dumped and be viewable through dashboard
        :param signal_name: the name of the signal as it will appear in dashboard
        :param dump_one_value_per_episode: should the signal value be written for each episode?
        :param dump_one_value_per_step: should the signal value be written for each step?
        :return: the created signal
        """
        signal = Signal(signal_name)
        if dump_one_value_per_episode:
            self.episode_signals.append(signal)
        if dump_one_value_per_step:
            self.step_signals.append(signal)
        return signal

    def set_environment_parameters(self, spaces: SpacesDefinition):
        """
        Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
        dependent on those values, by calling init_environment_dependent_modules
        :param spaces: the environment spaces definition
        :return: None
        """
        self.spaces = copy.deepcopy(spaces)

        if self.ap.algorithm.use_accumulated_reward_as_measurement:
            if 'measurements' in self.spaces.state.sub_spaces:
                self.spaces.state['measurements'].shape += 1
                self.spaces.state['measurements'].measurements_names += [
                    'accumulated_reward'
                ]
            else:
                self.spaces.state['measurements'] = VectorObservationSpace(
                    1, measurements_names=['accumulated_reward'])

        for observation_name in self.spaces.state.sub_spaces.keys():
            self.spaces.state[observation_name] = \
                self.pre_network_filter.get_filtered_observation_space(observation_name,
                    self.input_filter.get_filtered_observation_space(observation_name,
                                                                     self.spaces.state[observation_name]))

        self.spaces.reward = self.pre_network_filter.get_filtered_reward_space(
            self.input_filter.get_filtered_reward_space(self.spaces.reward))

        self.spaces.action = self.output_filter.get_unfiltered_action_space(
            self.spaces.action)

        if isinstance(self.in_action_space, GoalsSpace):
            # TODO: what if the goal type is an embedding / embedding change?
            self.spaces.goal = self.in_action_space
            self.spaces.goal.set_target_space(
                self.spaces.state[self.spaces.goal.goal_name])

        self.init_environment_dependent_modules()

    def create_networks(self) -> Dict[str, NetworkWrapper]:
        """
        Create all the networks of the agent.
        The network creation will be done after setting the environment parameters for the agent, since they are needed
        for creating the network.
        :return: A list containing all the networks
        """
        networks = {}
        for network_name in sorted(self.ap.network_wrappers.keys()):
            networks[network_name] = NetworkWrapper(
                name=network_name,
                agent_parameters=self.ap,
                has_target=self.ap.network_wrappers[network_name].
                create_target_network,
                has_global=self.has_global,
                spaces=self.spaces,
                replicated_device=self.replicated_device,
                worker_device=self.worker_device)
        return networks

    def init_environment_dependent_modules(self) -> None:
        """
        Initialize any modules that depend on knowing information about the environment such as the action space or
        the observation space
        :return: None
        """
        # initialize exploration policy
        self.ap.exploration.action_space = self.spaces.action
        self.exploration_policy = dynamic_import_and_instantiate_module_from_params(
            self.ap.exploration)

        # create all the networks of the agent
        self.networks = self.create_networks()

    @property
    def phase(self) -> RunPhase:
        return self._phase

    @phase.setter
    def phase(self, val: RunPhase) -> None:
        """
        Change the phase of the run for the agent and all the sub components
        :param phase: the new run phase (TRAIN, TEST, etc.)
        :return: None
        """
        self.reset_evaluation_state(val)
        self._phase = val
        self.exploration_policy.change_phase(val)

    def reset_evaluation_state(self, val: RunPhase) -> None:
        starting_evaluation = (val == RunPhase.TEST)
        ending_evaluation = (self.phase == RunPhase.TEST)

        if starting_evaluation:
            self.accumulated_rewards_across_evaluation_episodes = 0
            self.accumulated_shaped_rewards_across_evaluation_episodes = 0
            self.num_successes_across_evaluation_episodes = 0
            self.num_evaluation_episodes_completed = 0
            if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
                screen.log_title("{}: Starting evaluation phase".format(
                    self.name))

        elif ending_evaluation:
            # we write to the next episode, because it could be that the current episode was already written
            # to disk and then we won't write it again
            self.agent_logger.set_current_time(self.current_episode + 1)
            self.agent_logger.create_signal_value(
                'Evaluation Reward',
                self.accumulated_rewards_across_evaluation_episodes /
                self.num_evaluation_episodes_completed)
            self.agent_logger.create_signal_value(
                'Shaped Evaluation Reward',
                self.accumulated_shaped_rewards_across_evaluation_episodes /
                self.num_evaluation_episodes_completed)
            success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
            self.agent_logger.create_signal_value("Success Rate", success_rate)
            if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
                screen.log_title(
                    "{}: Finished evaluation phase. Success rate = {}".format(
                        self.name, np.round(success_rate, 2)))

    def call_memory(self, func, args=()):
        """
        This function is a wrapper to allow having the same calls for shared or unshared memories.
        It should be used instead of calling the memory directly in order to allow different algorithms to work
        both with a shared and a local memory.
        :param func: the name of the memory function to call
        :param args: the arguments to supply to the function
        :return: the return value of the function
        """
        if self.shared_memory:
            result = self.shared_memory_scratchpad.internal_call(
                self.memory_lookup_name, func, args)
        else:
            if type(args) != tuple:
                args = (args, )
            result = getattr(self.memory, func)(*args)
        return result

    def log_to_screen(self):
        # log to screen
        log = OrderedDict()
        log["Name"] = self.full_name_id
        if self.task_id is not None:
            log["Worker"] = self.task_id
        log["Episode"] = self.current_episode
        log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
        log["Exploration"] = np.round(
            self.exploration_policy.get_control_param(), 2)
        log["Steps"] = self.total_steps_counter
        log["Training iteration"] = self.training_iteration
        screen.log_dict(log, prefix=self.phase.value)

    def update_step_in_episode_log(self):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        self.agent_episode_logger.set_current_time(
            self.current_episode_steps_counter)
        self.agent_episode_logger.create_signal_value('Training Iter',
                                                      self.training_iteration)
        self.agent_episode_logger.create_signal_value(
            'In Heatup', int(self._phase == RunPhase.HEATUP))
        self.agent_episode_logger.create_signal_value(
            'ER #Transitions', self.call_memory('num_transitions'))
        self.agent_episode_logger.create_signal_value(
            'ER #Episodes', self.call_memory('length'))
        self.agent_episode_logger.create_signal_value('Total steps',
                                                      self.total_steps_counter)
        self.agent_episode_logger.create_signal_value(
            "Epsilon", self.exploration_policy.get_control_param())
        self.agent_episode_logger.create_signal_value(
            "Shaped Accumulated Reward",
            self.total_shaped_reward_in_current_episode)
        self.agent_episode_logger.create_signal_value('Update Target Network',
                                                      0,
                                                      overwrite=False)
        self.agent_episode_logger.update_wall_clock_time(
            self.current_episode_steps_counter)

        for signal in self.step_signals:
            self.agent_episode_logger.create_signal_value(
                signal.name, signal.get_last_value())

        # dump
        self.agent_episode_logger.dump_output_csv()

    def update_log(self):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        self.agent_logger.set_current_time(self.current_episode)
        self.agent_logger.create_signal_value('Training Iter',
                                              self.training_iteration)
        self.agent_logger.create_signal_value(
            'In Heatup', int(self._phase == RunPhase.HEATUP))
        self.agent_logger.create_signal_value(
            'ER #Transitions', self.call_memory('num_transitions'))
        self.agent_logger.create_signal_value('ER #Episodes',
                                              self.call_memory('length'))
        self.agent_logger.create_signal_value(
            'Episode Length', self.current_episode_steps_counter)
        self.agent_logger.create_signal_value('Total steps',
                                              self.total_steps_counter)
        self.agent_logger.create_signal_value(
            "Epsilon", np.mean(self.exploration_policy.get_control_param()))
        self.agent_logger.create_signal_value(
            "Shaped Training Reward",
            self.total_shaped_reward_in_current_episode
            if self._phase == RunPhase.TRAIN else np.nan)
        self.agent_logger.create_signal_value(
            "Training Reward", self.total_reward_in_current_episode
            if self._phase == RunPhase.TRAIN else np.nan)

        self.agent_logger.create_signal_value('Update Target Network',
                                              0,
                                              overwrite=False)
        self.agent_logger.update_wall_clock_time(self.current_episode)

        if self._phase != RunPhase.TEST:
            self.agent_logger.create_signal_value('Evaluation Reward',
                                                  np.nan,
                                                  overwrite=False)
            self.agent_logger.create_signal_value('Shaped Evaluation Reward',
                                                  np.nan,
                                                  overwrite=False)
            self.agent_logger.create_signal_value('Success Rate',
                                                  np.nan,
                                                  overwrite=False)

        for signal in self.episode_signals:
            self.agent_logger.create_signal_value(
                "{}/Mean".format(signal.name), signal.get_mean())
            self.agent_logger.create_signal_value(
                "{}/Stdev".format(signal.name), signal.get_stdev())
            self.agent_logger.create_signal_value("{}/Max".format(signal.name),
                                                  signal.get_max())
            self.agent_logger.create_signal_value("{}/Min".format(signal.name),
                                                  signal.get_min())

        # dump
        if self.current_episode % self.ap.visualization.dump_signals_to_csv_every_x_episodes == 0 \
                and self.current_episode > 0:
            self.agent_logger.dump_output_csv()

    def handle_episode_ended(self) -> None:
        """
        End an episode
        :return: None
        """
        self.current_episode_buffer.is_complete = True

        if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
            self.current_episode += 1

        if self.phase != RunPhase.TEST and isinstance(
                self.memory, EpisodicExperienceReplay):
            self.call_memory('store_episode', self.current_episode_buffer)

        if self.phase == RunPhase.TEST:
            self.accumulated_rewards_across_evaluation_episodes += self.total_reward_in_current_episode
            self.accumulated_shaped_rewards_across_evaluation_episodes += self.total_shaped_reward_in_current_episode
            self.num_evaluation_episodes_completed += 1

            if self.spaces.reward.reward_success_threshold and \
                    self.total_reward_in_current_episode >= self.spaces.reward.reward_success_threshold:
                self.num_successes_across_evaluation_episodes += 1

        if self.ap.visualization.dump_csv:
            self.update_log()

        if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
            self.log_to_screen()

    def reset_internal_state(self):
        """
        Reset all the episodic parameters
        :return: None
        """
        for signal in self.episode_signals:
            signal.reset()
        for signal in self.step_signals:
            signal.reset()
        self.agent_episode_logger.set_episode_idx(self.current_episode)
        self.total_shaped_reward_in_current_episode = 0
        self.total_reward_in_current_episode = 0
        self.curr_state = {}
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.current_episode_buffer = Episode(
            discount=self.ap.algorithm.discount)
        if self.exploration_policy:
            self.exploration_policy.reset()
        self.input_filter.reset()
        self.output_filter.reset()
        self.pre_network_filter.reset()
        if isinstance(self.memory, EpisodicExperienceReplay):
            self.call_memory('verify_last_episode_is_closed')

        for network in self.networks.values():
            network.online_network.reset_internal_memory()

    def learn_from_batch(self, batch) -> Tuple[float, List, List]:
        """
        Given a batch of transitions, calculates their target values and updates the network.
        :param batch: A list of transitions
        :return: The total loss of the training, the loss per head and the unclipped gradients
        """
        return 0, [], []

    def _should_update_online_weights_to_target(self):
        """
        Determine if online weights should be copied to the target.
        :return: boolean: True if the online weights should be copied to the target.
        """
        # update the target network of every network that has a target network
        step_method = self.ap.algorithm.num_steps_between_copying_online_weights_to_target
        if step_method.__class__ == TrainingSteps:
            should_update = (
                self.training_iteration -
                self.last_target_network_update_step) >= step_method.num_steps
            if should_update:
                self.last_target_network_update_step = self.training_iteration
        elif step_method.__class__ == EnvironmentSteps:
            should_update = (
                self.total_steps_counter -
                self.last_target_network_update_step) >= step_method.num_steps
            if should_update:
                self.last_target_network_update_step = self.total_steps_counter
        else:
            raise ValueError(
                "The num_steps_between_copying_online_weights_to_target parameter should be either "
                "EnvironmentSteps or TrainingSteps. Instead it is {}".format(
                    step_method.__class__))
        return should_update

    def _should_train(self, wait_for_full_episode=False):
        """
        Determine if we should start a training phase according to the number of steps passed since the last training
        :return:  boolean: True if we should start a training phase
        """
        step_method = self.ap.algorithm.num_consecutive_playing_steps
        if step_method.__class__ == EnvironmentEpisodes:
            should_update = (
                self.current_episode -
                self.last_training_phase_step) >= step_method.num_steps
            if should_update:
                self.last_training_phase_step = self.current_episode
        elif step_method.__class__ == EnvironmentSteps:
            should_update = (
                self.total_steps_counter -
                self.last_training_phase_step) >= step_method.num_steps
            if wait_for_full_episode:
                should_update = should_update and self.current_episode_steps_counter == 0
            if should_update:
                self.last_training_phase_step = self.total_steps_counter
        else:
            raise ValueError(
                "The num_consecutive_playing_steps parameter should be either "
                "EnvironmentSteps or Episodes. Instead it is {}".format(
                    step_method.__class__))
        return should_update

    def train(self):
        """
        Check if a training phase should be done as configured by num_consecutive_playing_steps.
        If it should, then do several training steps as configured by num_consecutive_training_steps.
        A single training iteration: Sample a batch, train on it and update target networks.
        :return: The total training loss during the training iterations.
        """
        loss = 0
        if self._should_train():
            for training_step in range(
                    self.ap.algorithm.num_consecutive_training_steps):
                # TODO: this should be network dependent
                network_parameters = list(self.ap.network_wrappers.values())[0]

                # update counters
                self.training_iteration += 1

                # sample a batch and train on it
                batch = self.call_memory('sample',
                                         network_parameters.batch_size)
                if self.pre_network_filter is not None:
                    batch = self.pre_network_filter.filter(
                        batch, update_internal_state=False, deep_copy=False)

                # if the batch returned empty then there are not enough samples in the replay buffer -> skip
                # training step
                if len(batch) > 0:
                    # train
                    batch = Batch(batch)
                    total_loss, losses, unclipped_grads = self.learn_from_batch(
                        batch)
                    loss += total_loss
                    self.unclipped_grads.add_sample(unclipped_grads)

                    # TODO: the learning rate decay should be done through the network instead of here
                    # decay learning rate
                    if network_parameters.learning_rate_decay_rate != 0:
                        self.curr_learning_rate.add_sample(
                            self.networks['main'].sess.run(
                                self.networks['main'].online_network.
                                current_learning_rate))
                    else:
                        self.curr_learning_rate.add_sample(
                            network_parameters.learning_rate)

                    if any([network.has_target for network in self.networks.values()]) \
                            and self._should_update_online_weights_to_target():
                        for network in self.networks.values():
                            network.update_target_network(
                                self.ap.algorithm.
                                rate_for_copying_weights_to_target)

                        self.agent_logger.create_signal_value(
                            'Update Target Network', 1)
                    else:
                        self.agent_logger.create_signal_value(
                            'Update Target Network', 0, overwrite=False)

                    self.loss.add_sample(loss)

                    if self.imitation:
                        self.log_to_screen()

            # run additional commands after the training is done
            self.post_training_commands()

        return loss

    def choose_action(self, curr_state):
        """
        choose an action to act with in the current episode being played. Different behavior might be exhibited when training
         or testing.

        :param curr_state: the current state to act upon.
        :return: chosen action, some action value describing the action (q-value, probability, etc)
        """
        pass

    def prepare_batch_for_inference(self,
                                    states: Union[Dict[str, np.ndarray],
                                                  List[Dict[str, np.ndarray]]],
                                    network_name: str):
        """
        convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all
        observations together, measurements together, etc.
        """
        # convert to batch so we can run it through the network
        states = force_list(states)
        batches_dict = {}
        for key in self.ap.network_wrappers[
                network_name].input_embedders_parameters.keys():
            # there are cases (e.g. ddpg) where the state does not contain all the information needed for running
            # through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
            # addition to the current_state, so that all the inputs of the network will be filled)
            if key in states[0].keys():
                batches_dict[key] = np.array(
                    [np.array(state[key]) for state in states])

        return batches_dict

    def act(self) -> ActionInfo:
        """
        Given the agents current knowledge, decide on the next action to apply to the environment
        :return: an action and a dictionary containing any additional info from the action decision process
        """
        if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
            # This agent never plays  while training (e.g. behavioral cloning)
            return None

        # count steps (only when training or if we are in the evaluation worker)
        if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
            self.total_steps_counter += 1
        self.current_episode_steps_counter += 1

        # decide on the action
        if self.phase == RunPhase.HEATUP and not self.ap.algorithm.heatup_using_network_decisions:
            # random action
            self.last_action_info = self.spaces.action.sample_with_info()
        else:
            # informed action
            if self.pre_network_filter is not None:
                # before choosing an action, first use the pre_network_filter to filter out the current state
                curr_state = self.run_pre_network_filter_for_inference(
                    self.curr_state)

            else:
                curr_state = self.curr_state
            self.last_action_info = self.choose_action(curr_state)

        filtered_action_info = self.output_filter.filter(self.last_action_info)

        return filtered_action_info

    def run_pre_network_filter_for_inference(self, state: StateType):
        dummy_env_response = EnvResponse(next_state=state,
                                         reward=0,
                                         game_over=False)
        return self.pre_network_filter.filter(dummy_env_response)[0].next_state

    def get_state_embedding(self, state: dict) -> np.ndarray:
        """
        Given a state, get the corresponding state embedding  from the main network
        :param state: a state dict
        :return: a numpy embedding vector
        """
        # TODO: this won't work anymore
        # TODO: instead of the state embedding (which contains the goal) we should use the observation embedding
        embedding = self.networks['main'].online_network.predict(
            self.prepare_batch_for_inference(state, "main"),
            outputs=self.networks['main'].online_network.state_embedding)
        return embedding

    def update_transition_before_adding_to_replay_buffer(
            self, transition: Transition) -> Transition:
        """
        Allows agents to update the transition just before adding it to the replay buffer.
        Can be useful for agents that want to tweak the reward, termination signal, etc.
        :param transition: the transition to update
        :return: the updated transition
        """
        return transition

    def observe(self, env_response: EnvResponse) -> bool:
        """
        Given a response from the environment, distill the observation from it and store it for later use.
        The response should be a dictionary containing the performed action, the new observation and measurements,
        the reward, a game over flag and any additional information necessary.
        :param env_response: result of call from environment.step(action)
        :return:
        """

        # filter the env_response
        filtered_env_response = self.input_filter.filter(env_response)[0]

        # inject agent collected statistics, if required
        if self.ap.algorithm.use_accumulated_reward_as_measurement:
            if 'measurements' in filtered_env_response.next_state:
                filtered_env_response.next_state['measurements'] = np.append(
                    filtered_env_response.next_state['measurements'],
                    self.total_shaped_reward_in_current_episode)
            else:
                filtered_env_response.next_state['measurements'] = np.array(
                    [self.total_shaped_reward_in_current_episode])

        # if we are in the first step in the episode, then we don't have a a next state and a reward and thus no
        # transition yet, and therefore we don't need to store anything in the memory.
        # also we did not reach the goal yet.
        if self.current_episode_steps_counter == 0:
            # initialize the current state
            self.curr_state = filtered_env_response.next_state
            return env_response.game_over
        else:
            transition = Transition(
                state=copy.copy(self.curr_state),
                action=self.last_action_info.action,
                reward=filtered_env_response.reward,
                next_state=filtered_env_response.next_state,
                game_over=filtered_env_response.game_over,
                info=filtered_env_response.info)

            # now that we have formed a basic transition - the next state progresses to be the current state
            self.curr_state = filtered_env_response.next_state

            # make agent specific changes to the transition if needed
            transition = self.update_transition_before_adding_to_replay_buffer(
                transition)

            # merge the intrinsic reward in
            if self.ap.algorithm.scale_external_reward_by_intrinsic_reward_value:
                transition.reward = transition.reward * (
                    1 + self.last_action_info.action_intrinsic_reward)
            else:
                transition.reward = transition.reward + self.last_action_info.action_intrinsic_reward

            # sum up the total shaped reward
            self.total_shaped_reward_in_current_episode += transition.reward
            self.total_reward_in_current_episode += env_response.reward
            self.shaped_reward.add_sample(transition.reward)
            self.reward.add_sample(env_response.reward)

            # add action info to transition
            if type(self.parent).__name__ == 'CompositeAgent':
                transition.add_info(self.parent.last_action_info.__dict__)
            else:
                transition.add_info(self.last_action_info.__dict__)

            # create and store the transition
            if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
                # for episodic memories we keep the transitions in a local buffer until the episode is ended.
                # for regular memories we insert the transitions directly to the memory
                if isinstance(self.memory, EpisodicExperienceReplay):
                    self.current_episode_buffer.insert(transition)
                else:
                    self.call_memory('store', transition)

            if self.ap.visualization.dump_in_episode_signals:
                self.update_step_in_episode_log()

            return transition.game_over

    def post_training_commands(self):
        pass

    def get_predictions(self, states: List[Dict[str, np.ndarray]],
                        prediction_type: PredictionType):
        """
        Get a prediction from the agent with regard to the requested prediction_type.
        If the agent cannot predict this type of prediction_type, or if there is more than possible way to do so,
        raise a ValueException.
        :param states:
        :param prediction_type:
        :return:
        """

        predictions = self.networks[
            'main'].online_network.predict_with_prediction_type(
                # states=self.dict_state_to_batches_dict(states, 'main'), prediction_type=prediction_type)
                states=states,
                prediction_type=prediction_type)

        if len(predictions.keys()) != 1:
            raise ValueError(
                "The network has more than one component {} matching the requested prediction_type {}. "
                .format(list(predictions.keys()), prediction_type))
        return list(predictions.values())[0]

    def set_incoming_directive(self, action: ActionType) -> None:
        if isinstance(self.in_action_space, GoalsSpace):
            self.current_hrl_goal = action
        elif isinstance(self.in_action_space, AttentionActionSpace):
            self.input_filter.observation_filters[
                'attention'].crop_low = action[0]
            self.input_filter.observation_filters[
                'attention'].crop_high = action[1]
            self.output_filter.action_filters['masking'].set_masking(
                action[0], action[1])

    def save_checkpoint(self, checkpoint_id: int) -> None:
        """
        Allows agents to store additional information when saving checkpoints.
        :param checkpoint_id: the id of the checkpoint
        :return: None
        """
        pass

    def sync(self) -> None:
        """
        Sync the global network parameters to local networks
        :return: None
        """
        for network in self.networks.values():
            network.sync()