def __init__(self, clip_ratio, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the PPO algorithm.
    """
    super(PPOAgent, self).__init__(name=kwargs.pop("name", "ppo-agent"), **kwargs)
    self.train_time_steps = 0

    # PPO uses a ring buffer.
    self.memory = Memory.from_spec(memory_spec)
    self.record_space = Dict(states=self.state_space, actions=self.action_space, rewards=float,
                             terminals=BoolBox(), add_batch_rank=False)
    self.policy = Policy(network_spec=self.neural_network, action_adapter_spec=None)

    self.merger = DictMerger(output_space=self.record_space)
    splitter_input_space = copy.deepcopy(self.record_space)
    self.splitter = ContainerSplitter(input_space=splitter_input_space)
    self.loss_function = PPOLossFunction(clip_ratio=clip_ratio, discount=self.discount)

    self.define_graph_api()
    if self.auto_build:
        self._build_graph()
        self.graph_built = True
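# For reference, `clip_ratio` parameterizes PPO's clipped surrogate objective. The function below is a
# minimal NumPy sketch of that objective (illustrative only, not the graph code inside PPOLossFunction);
# the array names are assumptions for the example.
import numpy as np

def ppo_clip_objective(log_probs, old_log_probs, advantages, clip_ratio=0.2):
    """Minimal sketch of the PPO clipped surrogate objective (to be maximized)."""
    # Likelihood ratio between the current policy and the rollout (old) policy.
    ratio = np.exp(log_probs - old_log_probs)
    # Clip the ratio into [1 - clip_ratio, 1 + clip_ratio].
    clipped_ratio = np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    # Element-wise minimum makes the update pessimistic w.r.t. the clipping.
    return np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))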
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, execution_spec=None,
             optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None, update_spec=None,
             summary_spec=None, saver_spec=None, auto_build=True, name="actor-critic-agent", gae_lambda=1.0,
             clip_rewards=0.0, sample_episodes=False, weight_entropy=None, memory_spec=None):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
            of ValueFunction.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clip value. If not 0, rewards will be clipped into this range.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes
            to fetch from the memory. If False, batch_size refers to the number of time-steps. This is
            especially relevant for environments where episode lengths may vastly differ throughout training.
            For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    # Set policy to stochastic.
    if policy_spec is not None:
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(ActorCriticAgent, self).__init__(
        state_space=state_space,
        action_space=action_space,
        discount=discount,
        preprocessing_spec=preprocessing_spec,
        network_spec=network_spec,
        internal_states_space=internal_states_space,
        policy_spec=policy_spec,
        value_function_spec=value_function_spec,
        execution_spec=execution_spec,
        optimizer_spec=optimizer_spec,
        value_function_optimizer_spec=value_function_optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        summary_spec=summary_spec,
        saver_spec=saver_spec,
        name=name,
        auto_build=auto_build
    )
    self.sample_episodes = sample_episodes

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True)
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = ActorCriticLossFunction(weight_entropy=weight_entropy)

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.gae_function
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"], build_options=self.build_options
        )
        self.graph_built = True
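# The GeneralizedAdvantageEstimation component above is parameterized by `gae_lambda`, `discount` and
# `clip_rewards`. As a reference for what those parameters mean, here is a minimal NumPy sketch of GAE over
# a single rollout (illustrative only, not the component's graph implementation).
import numpy as np

def gae_advantages(rewards, values, terminals, discount=0.98, gae_lambda=1.0, clip_rewards=0.0):
    """Sketch of generalized advantage estimation for one rollout.

    `values` must contain one extra entry: the bootstrap value of the state following the last step.
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    if clip_rewards != 0.0:
        rewards = np.clip(rewards, -clip_rewards, clip_rewards)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - float(terminals[t])
        # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t).
        delta = rewards[t] + discount * values[t + 1] * non_terminal - values[t]
        gae = delta + discount * gae_lambda * non_terminal * gae
        advantages[t] = gae
    return advantages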
def __init__(self, double_q=True, dueling_q=True, huber_loss=False, n_step=1, memory_spec=None,
             store_last_memory_batch=False, store_last_q_table=False, **kwargs):
    """
    Args:
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
            `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
            (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix action-adapter before passing it to the super constructor.
    action_adapter_spec = kwargs.pop("action_adapter_spec", dict())
    # Use a DuelingActionAdapter (instead of a basic ActionAdapter) if option is set.
    if dueling_q is True:
        action_adapter_spec["type"] = "dueling-action-adapter"
        assert "units_state_value_stream" in action_adapter_spec
        assert "units_advantage_stream" in action_adapter_spec

    super(DQNAgent, self).__init__(
        action_adapter_spec=action_adapter_spec,
        name=kwargs.pop("name", "dqn-agent"),
        **kwargs
    )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        weights="variables:policy",
        time_step=int,
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space,
        # TODO: This is currently necessary for multi-GPU handling (as the update_from_external_batch
        # TODO: gets overridden by a generic function with args=*inputs).
        #inputs=[preprocessed_state_space, self.action_space.with_batch_rank(), reward_space, terminal_space,
        #        preprocessed_state_space, weight_space]
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    self.target_policy.add_components(Synchronizable(), expose_apis="sync")
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        importance_weights=use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api("policy", "preprocessor-stack", self.optimizer.scope, *sub_components)

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
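# To make the `double_q` flag concrete: with double Q-learning, the online network selects the argmax
# action in the next state while the target network evaluates it. Below is a minimal NumPy sketch of the
# resulting 1-step targets (illustrative only; the actual computation lives in DQNLossFunction).
import numpy as np

def dqn_targets(rewards, terminals, q_next_online, q_next_target, discount=0.98, double_q=True):
    """Sketch of (double) DQN 1-step targets for a batch of transitions."""
    rewards = np.asarray(rewards, dtype=np.float32)
    not_done = 1.0 - np.asarray(terminals, dtype=np.float32)
    if double_q:
        # Online net picks the action, target net evaluates it.
        best_actions = np.argmax(q_next_online, axis=1)
        next_values = q_next_target[np.arange(len(rewards)), best_actions]
    else:
        # Vanilla DQN: the target net both picks and evaluates.
        next_values = np.max(q_next_target, axis=1)
    return rewards + discount * not_done * next_values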
def __init__(self, gae_lambda=1.0, sample_episodes=False, weight_entropy=None, memory_spec=None, **kwargs):
    """
    Args:
        gae_lambda (float): Lambda for generalized advantage estimation.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes
            to fetch from the memory. If False, batch_size refers to the number of time-steps. This is
            especially relevant for environments where episode lengths may vastly differ throughout training.
            For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    super(ActorCriticAgent, self).__init__(
        policy_spec=dict(deterministic=False),  # Set policy to stochastic.
        name=kwargs.pop("name", "actor-critic-agent"),
        **kwargs
    )
    self.sample_episodes = sample_episodes

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True)
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.loss_function = ActorCriticLossFunction(
        discount=self.discount, gae_lambda=gae_lambda, weight_entropy=weight_entropy
    )

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"],
            build_options=dict(vf_optimizer=self.value_function_optimizer)
        )
        self.graph_built = True
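# The `weight_entropy` coefficient scales an entropy bonus that discourages premature collapse to a
# deterministic policy. Below is a minimal sketch of an entropy-regularized policy-gradient loss for a
# discrete action distribution (illustrative only, not ActorCriticLossFunction's graph code; the default
# coefficient value is an assumption).
import numpy as np

def actor_loss_with_entropy(log_probs_taken, advantages, action_probs, weight_entropy=0.01):
    """Sketch: policy-gradient loss minus a weighted entropy bonus (to be minimized)."""
    pg_loss = -np.mean(log_probs_taken * advantages)
    # Entropy of the categorical action distribution, averaged over the batch.
    entropy = -np.mean(np.sum(action_probs * np.log(action_probs + 1e-10), axis=1))
    return pg_loss - weight_entropy * entropy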
def __init__(self, expert_margin=0.5, supervised_weight=1.0, double_q=True, dueling_q=True, huber_loss=False,
             n_step=1, shared_container_action_target=True, memory_spec=None, demo_memory_spec=None,
             demo_sample_ratio=0.2, store_last_memory_batch=False, store_last_q_table=False, **kwargs):
    # TODO: Most of this is a DQN duplicate, but the way the loss function is instantiated, inheriting
    # from DQN does not work well.
    """
    Args:
        expert_margin (float): The expert margin enforces a distance in Q-values between the expert action and
            all other actions.
        supervised_weight (float): Indicates the weight of the expert loss.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
        demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
            `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
            (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix action-adapter before passing it to the super constructor.
    policy_spec = kwargs.pop("policy_spec", dict())
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQFDAgent, self).__init__(
        policy_spec=policy_spec,
        name=kwargs.pop("name", "dqfd-agent"),
        **kwargs
    )

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.demo_batch_size = int(demo_sample_ratio * self.update_spec["batch_size"] / (1.0 - demo_sample_ratio))
    self.shared_container_action_target = shared_container_action_target

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        demo_batch_size=int,
        apply_demo_loss=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # Cannot have the same default name.
    demo_memory_spec["scope"] = "demo-memory"
    self.demo_memory = Memory.from_spec(demo_memory_spec)
    # The splitter for splitting up the records from the memories.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQFDLossFunction(
        expert_margin=expert_margin, supervised_weight=supervised_weight,
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
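# The `expert_margin` and `supervised_weight` arguments feed DQfD's large-margin supervised term,
# J_E(Q) = max_a[Q(s,a) + l(a_E,a)] - Q(s,a_E), where l(a_E,a) equals `expert_margin` for a != a_E and 0
# otherwise. A minimal NumPy sketch for a batch of demo transitions (illustrative only; the real term is
# built inside DQFDLossFunction).
import numpy as np

def dqfd_supervised_loss(q_values, expert_actions, expert_margin=0.5, supervised_weight=1.0):
    """Sketch of the DQfD large-margin classification loss."""
    batch_indices = np.arange(len(expert_actions))
    # Margin l(a_E, a): expert_margin everywhere except at the expert action.
    margins = np.full(q_values.shape, expert_margin)
    margins[batch_indices, expert_actions] = 0.0
    q_expert = q_values[batch_indices, expert_actions]
    # max_a [Q(s,a) + l(a_E,a)] - Q(s,a_E); zero once the expert action dominates by the margin.
    loss = np.max(q_values + margins, axis=1) - q_expert
    return supervised_weight * np.mean(loss)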
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, exploration_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None,
             update_spec=None, summary_spec=None, saver_spec=None, auto_build=True, name="agent"):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list): Neural network specification for baseline.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
    """
    super(Agent, self).__init__()
    self.name = name
    self.auto_build = auto_build
    self.graph_built = False
    self.logger = logging.getLogger(__name__)

    self.state_space = Space.from_spec(state_space).with_batch_rank(False)
    self.flat_state_space = self.state_space.flatten() if isinstance(self.state_space, ContainerSpace) else None
    self.logger.info("Parsed state space definition: {}".format(self.state_space))
    self.action_space = Space.from_spec(action_space).with_batch_rank(False)
    self.flat_action_space = self.action_space.flatten() if isinstance(self.action_space, ContainerSpace) else None
    self.logger.info("Parsed action space definition: {}".format(self.action_space))

    self.discount = discount
    self.build_options = {}

    # The agent's root-Component.
    self.root_component = Component(name=self.name, nesting_level=0)

    # Define the input-Spaces:
    # Tag the input-Space to `self.set_weights` as equal to whatever the variables-Space will be for
    # the Agent's policy Component.
    self.input_spaces = dict(
        states=self.state_space.with_batch_rank(),
    )

    # Construct the Preprocessor.
    self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
    self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
    self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
    if self.preprocessing_required:
        self.logger.info("Preprocessing required.")
        self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
    else:
        self.logger.info("No preprocessing required.")

    # Construct the Policy network.
    policy_spec = policy_spec or dict()
    if "network_spec" not in policy_spec:
        policy_spec["network_spec"] = network_spec
    if "action_space" not in policy_spec:
        policy_spec["action_space"] = self.action_space
    self.policy_spec = policy_spec
    # The behavioral policy of the algorithm. Also the one that gets updated.
    self.policy = Policy.from_spec(self.policy_spec)
    # Done by default.
    self.policy.add_components(Synchronizable(), expose_apis="sync")

    # Create non-shared baseline network.
    self.value_function = None
    if value_function_spec is not None:
        self.value_function = ValueFunction(network_spec=value_function_spec)
        self.value_function.add_components(Synchronizable(), expose_apis="sync")
        self.vars_merger = ContainerMerger("policy", "vf", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", "vf", scope="variable-container-splitter")
    else:
        self.vars_merger = ContainerMerger("policy", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", scope="variable-container-splitter")

    self.internal_states_space = Space.from_spec(internal_states_space)

    # An object implementing the loss function interface is only strictly needed
    # if automatic device strategies like multi-gpu are enabled. This is because
    # the device strategy needs to know the name of the loss function to infer the appropriate
    # operations.
    self.loss_function = None

    self.exploration = Exploration.from_spec(exploration_spec)
    self.execution_spec = parse_execution_spec(execution_spec)

    # Python-side experience buffer for better performance (may be disabled).
    self.default_env = "env_0"

    def factory_(i):
        if i < 2:
            return []
        return tuple([[] for _ in range(i)])

    self.states_buffer = defaultdict(list)  # partial(fact_, len(self.flat_state_space)))
    self.actions_buffer = defaultdict(partial(factory_, len(self.flat_action_space or [])))
    self.internals_buffer = defaultdict(list)
    self.rewards_buffer = defaultdict(list)
    self.next_states_buffer = defaultdict(list)  # partial(fact_, len(self.flat_state_space)))
    self.terminals_buffer = defaultdict(list)

    self.observe_spec = parse_observe_spec(observe_spec)

    # Global time step counter.
    self.timesteps = 0

    # Create the Agent's optimizer based on optimizer_spec and execution strategy.
    self.optimizer = None
    if optimizer_spec is not None:
        # Save the spec in case the agent needs to create more optimizers, e.g. for the baseline.
        self.optimizer_spec = optimizer_spec
        self.optimizer = Optimizer.from_spec(optimizer_spec)

    self.value_function_optimizer = None
    if self.value_function is not None:
        if value_function_optimizer_spec is None:
            vf_optimizer_spec = self.optimizer_spec
        else:
            vf_optimizer_spec = value_function_optimizer_spec
        vf_optimizer_spec["scope"] = "value-function-optimizer"
        self.value_function_optimizer = Optimizer.from_spec(vf_optimizer_spec)

    # Update-spec dict tells the Agent how to update (e.g. memory batch size).
    self.update_spec = parse_update_spec(update_spec)

    # Create our GraphBuilder and -Executor.
    self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
    self.graph_executor = GraphExecutor.from_spec(
        get_backend(),
        graph_builder=self.graph_builder,
        execution_spec=self.execution_spec,
        saver_spec=saver_spec
    )  # type: GraphExecutor
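# Everything in this constructor is driven by spec dicts resolved through the various `from_spec` calls
# (Space, PreprocessorStack, Policy, Exploration, Optimizer, GraphExecutor). The snippet below is a hedged
# sketch of what such specs might look like when configuring a concrete agent; the registered type strings
# and values are assumptions and may differ between RLGraph versions.
state_space_spec = dict(type="float-box", shape=(4,))      # assumed space type string
action_space_spec = dict(type="int-box", num_categories=2)  # assumed space type string
network_spec = [
    dict(type="dense", units=64, activation="relu"),
    dict(type="dense", units=64, activation="relu"),
]
optimizer_spec = dict(type="adam", learning_rate=0.001)
update_spec = dict(update_interval=4, batch_size=64, sync_interval=32)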
def __init__(self, double_q=True, dueling_q=True, huber_loss=False, n_step=1,
             shared_container_action_target=True, memory_spec=None, store_last_memory_batch=False,
             store_last_q_table=False, **kwargs):
    """
    Args:
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
            `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
            (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix action-adapter before passing it to the super constructor.
    policy_spec = kwargs.pop("policy_spec", dict())
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQNAgent, self).__init__(
        policy_spec=policy_spec,
        name=kwargs.pop("name", "dqn-agent"),
        **kwargs
    )

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    #self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.shared_container_action_target = shared_container_action_target

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        # Weights will have a Space derived from the vars of policy.
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space
    ))
    if self.value_function is not None:
        self.input_spaces["value_function_weights"] = "variables:{}".format(self.value_function.scope)

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy, self.target_policy,
        self.value_function, self.value_function_optimizer,  # <- should both be None for DQN
        self.exploration, self.loss_function, self.optimizer, self.vars_merger, self.vars_splitter
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
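# `importance_weights` only becomes an active loss input when the memory is a PrioritizedReplay. For
# reference, the usual prioritized-replay importance-sampling correction looks like the sketch below
# (illustrative only; the weights are produced by the memory component, not by this agent, and the
# beta/epsilon values are assumptions). Priorities are assumed to be already exponentiated by alpha.
import numpy as np

def importance_sampling_weights(priorities, sampled_indices, beta=0.4, epsilon=1e-6):
    """Sketch: IS-weights w_i = (N * P(i))^-beta, normalized by the maximum weight."""
    priorities = np.asarray(priorities, dtype=np.float64) + epsilon
    probs = priorities / np.sum(priorities)
    n = len(priorities)
    weights = (n * probs[sampled_indices]) ** (-beta)
    return weights / np.max(weights)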
def __init__(
    self,
    state_space,
    action_space,
    discount=0.98,
    preprocessing_spec=None,
    network_spec=None,
    internal_states_space=None,
    policy_spec=None,
    exploration_spec=None,
    execution_spec=None,
    optimizer_spec=None,
    observe_spec=None,
    update_spec=None,
    summary_spec=None,
    saver_spec=None,
    auto_build=True,
    name="dqfd-agent",
    expert_margin=0.5,
    supervised_weight=1.0,
    double_q=True,
    dueling_q=True,
    huber_loss=False,
    n_step=1,
    shared_container_action_target=False,
    memory_spec=None,
    demo_memory_spec=None,
    demo_sample_ratio=0.2
):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        expert_margin (float): The expert margin enforces a distance in Q-values between the expert action and
            all other actions.
        supervised_weight (float): Indicates the weight of the expert loss.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
        demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
    """
    # Fix action-adapter before passing it to the super constructor.
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        if policy_spec is None:
            policy_spec = {}
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQFDAgent, self).__init__(
        state_space=state_space,
        action_space=action_space,
        discount=discount,
        preprocessing_spec=preprocessing_spec,
        network_spec=network_spec,
        internal_states_space=internal_states_space,
        policy_spec=policy_spec,
        exploration_spec=exploration_spec,
        execution_spec=execution_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        summary_spec=summary_spec,
        saver_spec=saver_spec,
        auto_build=auto_build,
        name=name
    )

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.expert_margin = expert_margin

    self.batch_size = self.update_spec["batch_size"]
    self.default_margins = np.asarray([self.expert_margin] * self.batch_size)

    self.demo_batch_size = int(demo_sample_ratio * self.update_spec["batch_size"] / (1.0 - demo_sample_ratio))
    self.demo_margins = np.asarray([self.expert_margin] * self.demo_batch_size)
    self.shared_container_action_target = shared_container_action_target

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        demo_batch_size=int,
        apply_demo_loss=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        expert_margins=FloatBox(add_batch_rank=True),
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # Cannot have the same default name.
    demo_memory_spec["scope"] = "demo-memory"
    self.demo_memory = Memory.from_spec(demo_memory_spec)
    # The splitter for splitting up the records from the memories.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    self.use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQFDLossFunction(
        supervised_weight=supervised_weight,
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=self.use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
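# The formula `int(demo_sample_ratio * batch_size / (1 - demo_sample_ratio))` sizes the demo sample so that
# demonstrations make up roughly `demo_sample_ratio` of the combined update batch. A quick worked example
# (the concrete numbers are illustrative):
batch_size_example = 64
demo_sample_ratio_example = 0.2
demo_batch_size_example = int(demo_sample_ratio_example * batch_size_example / (1.0 - demo_sample_ratio_example))
# demo_batch_size_example == 16, combined batch == 64 + 16 == 80, and 16 / 80 == 0.2.
assert demo_batch_size_example == 16
assert abs(demo_batch_size_example / (batch_size_example + demo_batch_size_example) - demo_sample_ratio_example) < 1e-9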
def __init__(self, clip_ratio=0.2, gae_lambda=1.0, clip_rewards=0.0, standardize_advantages=False,
             sample_episodes=True, weight_entropy=None, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clip value. If not 0, rewards will be clipped into this range.
        standardize_advantages (bool): If True, standardize advantage values in the update.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes
            to fetch from the memory. If False, batch_size refers to the number of time-steps. This is
            especially relevant for environments where episode lengths may vastly differ throughout training.
            For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    if "policy_spec" in kwargs:
        policy_spec = kwargs.pop("policy_spec")
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(PPOAgent, self).__init__(
        policy_spec=policy_spec,  # Set policy to stochastic.
        name=kwargs.pop("name", "ppo-agent"),
        **kwargs
    )
    self.sample_episodes = sample_episodes

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:policy",
        value_function_weights="variables:value-function",
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True),
        apply_postprocessing=bool
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: PPO memory must be ring-buffer for episode-handling!"

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = PPOLossFunction(
        clip_ratio=clip_ratio, standardize_advantages=standardize_advantages, weight_entropy=weight_entropy
    )

    self.iterations = self.update_spec["num_iterations"]
    self.sample_size = self.update_spec["sample_size"]
    self.batch_size = self.update_spec["batch_size"]

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy, self.exploration,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.vars_merger, self.vars_splitter, self.gae_function
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            # Important: Use sample-size, not batch-size, as the sub-samples (from a batch) are the ones that
            # get multi-gpu-split.
            batch_size=self.update_spec["sample_size"],
            build_options=self.build_options
        )
        self.graph_built = True
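# `num_iterations`, `sample_size` and `batch_size` from `update_spec` describe PPO's update pattern: pull
# one batch from the ring buffer, then run several optimization iterations on random sub-samples of size
# `sample_size`. The sketch below shows that loop schematically (an assumption about the pattern, not the
# agent's actual `update()` implementation); `update_fn` stands in for one gradient step.
import numpy as np

def ppo_update_loop(batch, num_iterations, sample_size, update_fn, rng=np.random):
    """Sketch: repeatedly update on random sub-samples of one sampled batch of NumPy arrays."""
    batch_size = len(batch["rewards"])
    for _ in range(num_iterations):
        idx = rng.choice(batch_size, size=min(sample_size, batch_size), replace=False)
        sub_batch = {key: values[idx] for key, values in batch.items()}
        update_fn(sub_batch)  # e.g. one gradient step on policy and value function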
def __init__(
    self,
    state_space,
    action_space,
    discount=0.98,
    preprocessing_spec=None,
    network_spec=None,
    internal_states_space=None,
    policy_spec=None,
    exploration_spec=None,
    execution_spec=None,
    optimizer_spec=None,
    observe_spec=None,
    update_spec=None,
    summary_spec=None,
    saver_spec=None,
    auto_build=True,
    name="dqn-agent",
    double_q=True,
    dueling_q=True,
    huber_loss=False,
    n_step=1,
    shared_container_action_target=True,
    memory_spec=None
):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
    """
    # Fix action-adapter before passing it to the super constructor.
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        if policy_spec is None:
            policy_spec = {}
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQNAgent, self).__init__(
        state_space=state_space,
        action_space=action_space,
        discount=discount,
        preprocessing_spec=preprocessing_spec,
        network_spec=network_spec,
        internal_states_space=internal_states_space,
        policy_spec=policy_spec,
        exploration_spec=exploration_spec,
        execution_spec=execution_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        summary_spec=summary_spec,
        saver_spec=saver_spec,
        auto_build=auto_build,
        name=name
    )

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.shared_container_action_target = shared_container_action_target

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        # Weights will have a Space derived from the vars of policy.
        policy_weights="variables:{}".format(self.policy.scope),
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space,
        apply_postprocessing=bool
    ))
    if self.value_function is not None:
        self.input_spaces["value_function_weights"] = "variables:{}".format(self.value_function.scope)

    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    self.root_component.add_components(
        self.preprocessor, self.memory, self.splitter, self.policy, self.target_policy,
        self.value_function, self.value_function_optimizer,  # <- should both be None for DQN
        self.exploration, self.loss_function, self.optimizer, self.vars_merger, self.vars_splitter
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
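# A hedged end-to-end usage sketch for this constructor. The import paths follow RLGraph's package layout,
# but the memory type string and the exact spec keys are assumptions that may differ between versions, so
# treat this as an illustration rather than a verified example.
from rlgraph.agents import DQNAgent
from rlgraph.spaces import FloatBox, IntBox

agent = DQNAgent(
    state_space=FloatBox(shape=(4,)),
    action_space=IntBox(2),
    network_spec=[dict(type="dense", units=64)],
    memory_spec=dict(type="prioritized_replay", capacity=10000),  # assumed memory type string
    optimizer_spec=dict(type="adam", learning_rate=0.001),
    update_spec=dict(update_interval=4, sync_interval=32, batch_size=64),
    double_q=True,
    dueling_q=False  # keep the basic Policy so no dueling stream sizes are required
)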