Example #1
class SACAgentComponent(Component):
    def __init__(self,
                 agent,
                 policy,
                 q_function,
                 preprocessor,
                 memory,
                 discount,
                 initial_alpha,
                 target_entropy,
                 optimizer,
                 vf_optimizer,
                 alpha_optimizer,
                 q_sync_spec,
                 num_q_functions=2):
        super(SACAgentComponent, self).__init__(nesting_level=0)
        self.agent = agent
        self._policy = policy
        self._preprocessor = preprocessor
        self._memory = memory
        self._q_functions = [q_function]
        self._q_functions += [
            q_function.copy(scope="{}-{}".format(q_function.scope, i + 1),
                            trainable=True) for i in range(num_q_functions - 1)
        ]

        # Set number of return values for get_q_values graph_fn.
        self.graph_fn_num_outputs["_graph_fn_get_q_values"] = num_q_functions

        for q in self._q_functions:
            # TODO: is there a better way to do this?
            if "synchronizable" not in q.sub_components:
                q.add_components(Synchronizable(), expose_apis="sync")
        self._target_q_functions = [
            q.copy(scope="target-" + q.scope, trainable=True)
            for q in self._q_functions
        ]
        for target_q in self._target_q_functions:
            # TODO: is there a better way to do this?
            if "synchronizable" not in target_q.sub_components:
                target_q.add_components(Synchronizable(), expose_apis="sync")
        self._optimizer = optimizer
        self.vf_optimizer = vf_optimizer
        self.alpha_optimizer = alpha_optimizer
        self.initial_alpha = initial_alpha
        self.log_alpha = None
        self.target_entropy = target_entropy
        self.loss_function = SACLossFunction(target_entropy=target_entropy,
                                             discount=discount,
                                             num_q_functions=num_q_functions)

        memory_items = [
            "states", "actions", "rewards", "next_states", "terminals"
        ]
        self._merger = ContainerMerger(*memory_items)

        q_names = ["q_{}".format(i) for i in range(len(self._q_functions))]
        self._q_vars_merger = ContainerMerger(*q_names, scope="q_vars_merger")

        self.add_components(policy, preprocessor, memory, self._merger,
                            self.loss_function, optimizer, vf_optimizer,
                            self._q_vars_merger)  # , self._q_vars_splitter)
        self.add_components(*self._q_functions)
        self.add_components(*self._target_q_functions)
        if self.alpha_optimizer is not None:
            self.add_components(self.alpha_optimizer)

        self.steps_since_last_sync = None
        self.q_sync_spec = q_sync_spec
        self.env_action_space = None
        self.episode_reward = None

    def check_input_spaces(self, input_spaces, action_space=None):
        for s in [
                "states", "actions", "env_actions", "preprocessed_states",
                "rewards", "terminals"
        ]:
            sanity_check_space(input_spaces[s], must_have_batch_rank=True)

        self.env_action_space = input_spaces["env_actions"].flatten()

    def create_variables(self, input_spaces, action_space=None):
        self.steps_since_last_sync = self.get_variable("steps_since_last_sync",
                                                       dtype="int",
                                                       initializer=0)
        self.log_alpha = self.get_variable("log_alpha",
                                           dtype="float",
                                           initializer=np.log(
                                               self.initial_alpha))
        self.episode_reward = self.get_variable("episode_reward",
                                                shape=(),
                                                initializer=0.0)

    @rlgraph_api
    def get_policy_weights(self):
        return self._policy.variables()

    @rlgraph_api
    def get_q_weights(self):
        merged_weights = self._q_vars_merger.merge(
            *[q.variables() for q in self._q_functions])
        return merged_weights

    @rlgraph_api(must_be_complete=False)
    def set_policy_weights(self, weights):
        return self._policy.sync(weights)

    """ TODO: need to define the input space
    @rlgraph_api(must_be_complete=False)
    def set_q_weights(self, q_weights):
        split_weights = self._q_vars_splitter.call(q_weights)
        assert len(split_weights) == len(self._q_functions)
        update_ops = [q.sync(q_weights) for q_weights, q in zip(split_weights, self._q_functions)]
        update_ops.extend([q.sync(q_weights) for q_weights, q in zip(split_weights, self._target_q_functions)])
        return tuple(update_ops)
    """

    @rlgraph_api
    def preprocess_states(self, states):
        return self._preprocessor.preprocess(states)

    @rlgraph_api
    def insert_records(self, preprocessed_states, env_actions, rewards,
                       next_states, terminals):
        records = self._merger.merge(preprocessed_states, env_actions, rewards,
                                     next_states, terminals)
        return self._memory.insert_records(records)

    @rlgraph_api
    def update_from_memory(self, batch_size=64, time_percentage=None):
        records, sample_indices, importance_weights = self._memory.get_records(
            batch_size)
        result = self.update_from_external_batch(
            records["states"], records["actions"], records["rewards"],
            records["terminals"], records["next_states"], importance_weights,
            time_percentage)

        if isinstance(self._memory, PrioritizedReplay):
            update_pr_step_op = self._memory.update_records(
                sample_indices, result["critic_loss_per_item"])
            result["update_pr_step_op"] = update_pr_step_op

        return result

    @rlgraph_api
    def update_from_external_batch(self,
                                   preprocessed_states,
                                   env_actions,
                                   rewards,
                                   terminals,
                                   next_states,
                                   importance_weights,
                                   time_percentage=None):
        actions = self._graph_fn_one_hot(env_actions)
        actor_loss, actor_loss_per_item, critic_loss, critic_loss_per_item, alpha_loss, alpha_loss_per_item = \
            self.get_losses(preprocessed_states, actions, rewards, terminals, next_states, importance_weights)

        policy_vars = self._policy.variables()
        q_vars = [q_func.variables() for q_func in self._q_functions]
        merged_q_vars = self._q_vars_merger.merge(*q_vars)
        critic_step_op = self.vf_optimizer.step(merged_q_vars, critic_loss,
                                                critic_loss_per_item,
                                                time_percentage)
        actor_step_op = self._optimizer.step(policy_vars, actor_loss,
                                             actor_loss_per_item,
                                             time_percentage)

        if self.target_entropy is not None:
            alpha_step_op = self._graph_fn_update_alpha(
                alpha_loss, alpha_loss_per_item, time_percentage)
        else:
            alpha_step_op = self._graph_fn_no_op()
        # TODO: optimizer for alpha

        sync_op = self.sync_targets()

        # Increase the global training step counter.
        alpha_step_op = self._graph_fn_training_step(alpha_step_op)

        return dict(actor_step_op=actor_step_op,
                    critic_step_op=critic_step_op,
                    sync_op=sync_op,
                    alpha_step_op=alpha_step_op,
                    actor_loss=actor_loss,
                    actor_loss_per_item=actor_loss_per_item,
                    critic_loss=critic_loss,
                    critic_loss_per_item=critic_loss_per_item,
                    alpha_loss=alpha_loss,
                    alpha_loss_per_item=alpha_loss_per_item)

    @graph_fn(flatten_ops=True,
              split_ops=True,
              add_auto_key_as_first_param=True)
    def _graph_fn_one_hot(self, key, env_actions):
        if isinstance(self.env_action_space[key], IntBox):
            env_actions = tf.one_hot(
                env_actions,
                depth=self.env_action_space[key].num_categories,
                axis=-1)
        return env_actions

    @graph_fn(requires_variable_completeness=True)
    def _graph_fn_update_alpha(self,
                               alpha_loss,
                               alpha_loss_per_item,
                               time_percentage=None):
        alpha_step_op = self.alpha_optimizer.step(
            DataOpTuple([self.log_alpha]), alpha_loss, alpha_loss_per_item,
            time_percentage)
        return alpha_step_op

    @rlgraph_api  # `returns` are determined in ctor
    def _graph_fn_get_q_values(self,
                               preprocessed_states,
                               actions,
                               target=False):
        backend = get_backend()

        flat_actions = flatten_op(actions)
        actions = []
        for flat_key, action_component in self._policy.action_space.flatten(
        ).items():
            actions.append(flat_actions[flat_key])

        if backend == "tf":
            actions = tf.concat(actions, axis=-1)
        elif backend == "pytorch":
            actions = torch.cat(actions, dim=-1)

        q_funcs = self._q_functions if target is False else self._target_q_functions

        # We do not concat states yet because we might pass them through a conv stack before merging
        # them with actions.
        return tuple(
            q.state_action_value(preprocessed_states, actions)
            for q in q_funcs)

    @rlgraph_api
    def get_losses(self, preprocessed_states, actions, rewards, terminals,
                   next_states, importance_weights):
        # TODO: internal states
        samples_next = self._policy.get_action_and_log_likelihood(
            next_states, deterministic=False)
        next_sampled_actions = samples_next["action"]
        log_probs_next_sampled = samples_next["log_likelihood"]

        q_values_next_sampled = self.get_q_values(next_states,
                                                  next_sampled_actions,
                                                  target=True)
        q_values = self.get_q_values(preprocessed_states, actions)
        samples = self._policy.get_action_and_log_likelihood(
            preprocessed_states, deterministic=False)
        sampled_actions = samples["action"]
        log_probs_sampled = samples["log_likelihood"]
        q_values_sampled = self.get_q_values(preprocessed_states,
                                             sampled_actions)

        alpha = self._graph_fn_compute_alpha()

        return self.loss_function.loss(alpha, log_probs_next_sampled,
                                       q_values_next_sampled, q_values,
                                       log_probs_sampled, q_values_sampled,
                                       rewards, terminals)

    @rlgraph_api
    def get_preprocessed_state_and_action(self, states, deterministic=False):
        preprocessed_states = self._preprocessor.preprocess(states)
        return self.action_from_preprocessed_state(preprocessed_states,
                                                   deterministic)

    @rlgraph_api
    def action_from_preprocessed_state(self,
                                       preprocessed_states,
                                       deterministic=False):
        out = self._policy.get_action(preprocessed_states,
                                      deterministic=deterministic)
        return out["action"], preprocessed_states

    @rlgraph_api(requires_variable_completeness=True)
    def reset_targets(self):
        ops = (target_q.sync(q.variables()) for q, target_q in zip(
            self._q_functions, self._target_q_functions))
        return tuple(ops)

    @rlgraph_api(requires_variable_completeness=True)
    def sync_targets(self):
        should_sync = self._graph_fn_get_should_sync()
        return self._graph_fn_sync(should_sync)

    @rlgraph_api
    def get_memory_size(self):
        return self._memory.get_size()

    @graph_fn
    def _graph_fn_compute_alpha(self):
        backend = get_backend()
        if backend == "tf":
            return tf.exp(self.log_alpha)
        elif backend == "pytorch":
            return torch.exp(self.log_alpha)

    # TODO: Move this into generic AgentRootComponent.
    @graph_fn
    def _graph_fn_training_step(self, other_step_op=None):
        if self.agent is not None:
            add_op = tf.assign_add(
                self.agent.graph_executor.global_training_timestep, 1)
            op_list = [add_op] + ([other_step_op] if other_step_op is not None else [])
            with tf.control_dependencies(op_list):
                return tf.no_op() if other_step_op is None else other_step_op
        else:
            return tf.no_op() if other_step_op is None else other_step_op

    @graph_fn(returns=1, requires_variable_completeness=True)
    def _graph_fn_get_should_sync(self):
        if get_backend() == "tf":
            inc_op = tf.assign_add(self.steps_since_last_sync, 1)
            should_sync = inc_op >= self.q_sync_spec.sync_interval

            def reset_op():
                op = tf.assign(self.steps_since_last_sync, 0)
                with tf.control_dependencies([op]):
                    return tf.no_op()

            sync_op = tf.cond(pred=inc_op >= self.q_sync_spec.sync_interval,
                              true_fn=reset_op,
                              false_fn=tf.no_op)
            with tf.control_dependencies([sync_op]):
                return tf.identity(should_sync)
        else:
            raise NotImplementedError("TODO")

    @graph_fn(returns=1, requires_variable_completeness=True)
    def _graph_fn_sync(self, should_sync):
        assign_ops = []
        tau = self.q_sync_spec.sync_tau
        if tau != 1.0:
            all_source_vars = [
                source.get_variables(collections=None,
                                     custom_scope_separator="-")
                for source in self._q_functions
            ]
            all_dest_vars = [
                destination.get_variables(collections=None,
                                          custom_scope_separator="-")
                for destination in self._target_q_functions
            ]
            for source_vars, dest_vars in zip(all_source_vars, all_dest_vars):
                for (source_key, source_var), (dest_key, dest_var) in zip(
                        sorted(source_vars.items()),
                        sorted(dest_vars.items())):
                    assign_ops.append(
                        tf.assign(dest_var,
                                  tau * source_var + (1.0 - tau) * dest_var))
        else:
            all_source_vars = [
                source.variables() for source in self._q_functions
            ]
            for source_vars, destination in zip(all_source_vars,
                                                self._target_q_functions):
                assign_ops.append(destination.sync(source_vars))
        assert len(assign_ops) > 0
        grouped_op = tf.group(assign_ops)

        def assign_op():
            # Make sure we return a no_op as opposed to a variable reference.
            with tf.control_dependencies([grouped_op]):
                return tf.no_op()

        cond_assign_op = tf.cond(should_sync,
                                 true_fn=assign_op,
                                 false_fn=tf.no_op)
        with tf.control_dependencies([cond_assign_op]):
            return tf.no_op()

    @graph_fn
    def _graph_fn_no_op(self):
        return tf.no_op()

    @rlgraph_api
    def get_global_timestep(self):
        return self.read_variable(self.agent.graph_executor.global_timestep)

    @rlgraph_api
    def _graph_fn_update_global_timestep(self, increment):
        if get_backend() == "tf":
            add_op = tf.assign_add(self.agent.graph_executor.global_timestep,
                                   increment)
            return add_op
        elif get_backend() == "pytorch":
            self.agent.graph_executor.global_timestep += increment
            return self.agent.graph_executor.global_timestep

    @rlgraph_api
    def _graph_fn_get_episode_reward(self):
        return self.episode_reward

    @rlgraph_api
    def _graph_fn_set_episode_reward(self, episode_reward):
        return tf.assign(self.episode_reward, episode_reward)
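For reference, the `_graph_fn_sync` method above applies a Polyak (soft) target-network update whenever `sync_tau != 1.0`. Below is a minimal standalone NumPy sketch of that update rule; it is illustrative only and not part of RLgraph.

import numpy as np

def soft_update(target_params, source_params, tau=0.005):
    # target <- tau * source + (1 - tau) * target, applied element-wise per variable.
    return [tau * s + (1.0 - tau) * t for s, t in zip(source_params, target_params)]

# Example: after one soft update the target moves a small step towards the source.
target = [np.zeros(3)]
source = [np.ones(3)]
target = soft_update(target, source, tau=0.1)  # -> [array([0.1, 0.1, 0.1])]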
Example #2
    def __init__(self,
                 agent,
                 policy,
                 q_function,
                 preprocessor,
                 memory,
                 discount,
                 initial_alpha,
                 target_entropy,
                 optimizer,
                 vf_optimizer,
                 alpha_optimizer,
                 q_sync_spec,
                 num_q_functions=2):
        super(SACAgentComponent, self).__init__(nesting_level=0)
        self.agent = agent
        self._policy = policy
        self._preprocessor = preprocessor
        self._memory = memory
        self._q_functions = [q_function]
        self._q_functions += [
            q_function.copy(scope="{}-{}".format(q_function.scope, i + 1),
                            trainable=True) for i in range(num_q_functions - 1)
        ]

        # Set number of return values for get_q_values graph_fn.
        self.graph_fn_num_outputs["_graph_fn_get_q_values"] = num_q_functions

        for q in self._q_functions:
            # TODO: is there a better way to do this?
            if "synchronizable" not in q.sub_components:
                q.add_components(Synchronizable(), expose_apis="sync")
        self._target_q_functions = [
            q.copy(scope="target-" + q.scope, trainable=True)
            for q in self._q_functions
        ]
        for target_q in self._target_q_functions:
            # TODO: is there a better way to do this?
            if "synchronizable" not in target_q.sub_components:
                target_q.add_components(Synchronizable(), expose_apis="sync")
        self._optimizer = optimizer
        self.vf_optimizer = vf_optimizer
        self.alpha_optimizer = alpha_optimizer
        self.initial_alpha = initial_alpha
        self.log_alpha = None
        self.target_entropy = target_entropy
        self.loss_function = SACLossFunction(target_entropy=target_entropy,
                                             discount=discount,
                                             num_q_functions=num_q_functions)

        memory_items = [
            "states", "actions", "rewards", "next_states", "terminals"
        ]
        self._merger = ContainerMerger(*memory_items)

        q_names = ["q_{}".format(i) for i in range(len(self._q_functions))]
        self._q_vars_merger = ContainerMerger(*q_names, scope="q_vars_merger")

        self.add_components(policy, preprocessor, memory, self._merger,
                            self.loss_function, optimizer, vf_optimizer,
                            self._q_vars_merger)  # , self._q_vars_splitter)
        self.add_components(*self._q_functions)
        self.add_components(*self._target_q_functions)
        if self.alpha_optimizer is not None:
            self.add_components(self.alpha_optimizer)

        self.steps_since_last_sync = None
        self.q_sync_spec = q_sync_spec
        self.env_action_space = None
        self.episode_reward = None
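This constructor builds `num_q_functions` critics plus matching target networks; SAC-style losses then bootstrap from the minimum over the target critics. The following NumPy sketch shows that target computation under the usual SAC formulation; it is illustrative and not the `SACLossFunction` implementation.

import numpy as np

def soft_q_target(q_values_next, log_prob_next, rewards, terminals, alpha=0.2, discount=0.99):
    # q_values_next: one array per (target) Q-function, each shaped [batch].
    min_q_next = np.minimum.reduce(q_values_next)          # pessimistic estimate over the twin critics
    soft_value_next = min_q_next - alpha * log_prob_next   # soft value: min_i Q_i(s',a') - alpha * log pi(a'|s')
    return rewards + discount * (1.0 - terminals) * soft_value_next

# Dummy batch of two transitions.
target = soft_q_target(
    q_values_next=[np.array([1.0, 2.0]), np.array([0.5, 3.0])],
    log_prob_next=np.array([-1.0, -0.5]),
    rewards=np.array([0.0, 1.0]),
    terminals=np.array([0.0, 1.0]),
)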
Example #3
    def __init__(self, expert_margin=0.5, supervised_weight=1.0, double_q=True, dueling_q=True,
                 huber_loss=False, n_step=1, shared_container_action_target=True,
                 memory_spec=None, demo_memory_spec=None,
                 demo_sample_ratio=0.2, store_last_memory_batch=False, store_last_q_table=False, **kwargs):
        # TODO: Most of this duplicates DQN, but because of the way the loss function is instantiated,
        # inheriting from DQN does not work well.
        """
        Args:
            expert_margin (float): The expert margin enforces a distance in Q-values between expert action and
                all other actions.
            supervised_weight (float): Indicates weight of the expert loss.
            double_q (bool): Whether to use the double DQN loss function (see [2]).
            dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
            huber_loss (bool): Whether to apply a Huber loss (see [4]).
            n_step (Optional[int]): n-step adjustment to discounting.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
            demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
            store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
                `self.last_memory_batch` for debugging purposes.
                Default: False.
            store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
                (memory or external) in `self.last_q_table` for debugging purposes.
                Default: False.
        """
        # Fix action-adapter before passing it to the super constructor.
        policy_spec = kwargs.pop("policy_spec", dict())
        # Use a DuelingPolicy (instead of a basic Policy) if option is set.
        if dueling_q is True:
            policy_spec["type"] = "dueling-policy"
            # Give us some default state-value nodes.
            if "units_state_value_stream" not in policy_spec:
                policy_spec["units_state_value_stream"] = 128
        super(DQFDAgent, self).__init__(
            policy_spec=policy_spec, name=kwargs.pop("name", "dqfd-agent"), **kwargs
        )
        # Check that sync_interval is a multiple of update_interval.
        if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
                self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be multiple of update_interval "
                "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
            )

        self.double_q = double_q
        self.dueling_q = dueling_q
        self.huber_loss = huber_loss
        self.demo_batch_size = int(demo_sample_ratio * self.update_spec['batch_size'] / (1.0 - demo_sample_ratio))
        self.shared_container_action_target = shared_container_action_target

        # Debugging tools.
        self.store_last_memory_batch = store_last_memory_batch
        self.last_memory_batch = None
        self.store_last_q_table = store_last_q_table
        self.last_q_table = None

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        weight_space = FloatBox(add_batch_rank=True)

        self.input_spaces.update(dict(
            actions=self.action_space.with_batch_rank(),
            policy_weights="variables:{}".format(self.policy.scope),
            time_step=int,
            use_exploration=bool,
            demo_batch_size=int,
            apply_demo_loss=bool,
            preprocessed_states=preprocessed_state_space,
            rewards=reward_space,
            terminals=terminal_space,
            next_states=preprocessed_state_space,
            preprocessed_next_states=preprocessed_state_space,
            importance_weights=weight_space
        ))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")

        # The replay memory.
        self.memory = Memory.from_spec(memory_spec)
        # Cannot have same default name.
        demo_memory_spec["scope"] = "demo-memory"
        self.demo_memory = Memory.from_spec(demo_memory_spec)

        # The splitter for splitting up the records from the memories.
        self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

        # Copy our Policy (target-net), make target-net synchronizable.
        self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
        # Number of steps since the last target-net sync from the main policy.
        self.steps_since_target_net_sync = 0

        use_importance_weights = isinstance(self.memory, PrioritizedReplay)
        self.loss_function = DQFDLossFunction(
            expert_margin=expert_margin, supervised_weight=supervised_weight,
            discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
            shared_container_action_target=shared_container_action_target,
            importance_weights=use_importance_weights, n_step=n_step
        )

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
            self.target_policy, self.exploration, self.loss_function, self.optimizer
        )

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()

        if self.auto_build:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"])
            self.graph_built = True
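The `demo_batch_size` formula above keeps demonstration samples at `demo_sample_ratio` of the combined (online + demo) batch. A quick numeric check with the default ratio of 0.2 and a hypothetical update batch size of 64:

batch_size = 64
demo_sample_ratio = 0.2
demo_batch_size = int(demo_sample_ratio * batch_size / (1.0 - demo_sample_ratio))
print(demo_batch_size)                                   # 16
print(demo_batch_size / (batch_size + demo_batch_size))  # 0.2 -> demos are 20% of the combined batch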
Example #4
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="actor-critic-agent",
                 gae_lambda=1.0,
                 clip_rewards=0.0,
                 sample_episodes=False,
                 weight_entropy=None,
                 memory_spec=None):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.
            discount (float): The discount factor (gamma).
            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.
            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.
            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
                of ValueFunction.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
            value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).
            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.
            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.
            name (str): Some name for this Agent object.
            gae_lambda (float): Lambda for generalized advantage estimation.
            clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a +/- `clip_rewards`
                range.
            sample_episodes (bool): If true, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If false, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        # Set policy to stochastic.
        if policy_spec is not None:
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(ActorCriticAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            value_function_spec=value_function_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            value_function_optimizer_spec=value_function_optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            name=name,
            auto_build=auto_build)
        self.sample_episodes = sample_episodes

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:{}".format(self.policy.scope),
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True)))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(self.memory, RingBuffer), \
            "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals")

        self.gae_function = GeneralizedAdvantageEstimation(
            gae_lambda=gae_lambda,
            discount=self.discount,
            clip_rewards=clip_rewards)
        self.loss_function = ActorCriticLossFunction(
            weight_entropy=weight_entropy)

        # Add all our sub-components to the core.
        sub_components = [
            self.preprocessor, self.merger, self.memory, self.splitter,
            self.policy, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer,
            self.gae_function
        ]
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"],
                              build_options=self.build_options)

            self.graph_built = True
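`GeneralizedAdvantageEstimation` implements the standard GAE recursion parameterized by `gae_lambda` and `discount`. A self-contained NumPy sketch of that recursion, for orientation only (not the RLgraph component's code):

import numpy as np

def gae_advantages(rewards, values, terminals, discount=0.98, gae_lambda=1.0):
    # values must contain one extra bootstrap entry for the state after the last step.
    advantages = np.zeros(len(rewards))
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - terminals[t]
        delta = rewards[t] + discount * values[t + 1] * non_terminal - values[t]
        last_gae = delta + discount * gae_lambda * non_terminal * last_gae
        advantages[t] = last_gae
    return advantages

# Tiny 3-step rollout with a bootstrap value appended to `values`.
adv = gae_advantages(
    rewards=np.array([1.0, 1.0, 1.0]),
    values=np.array([0.5, 0.5, 0.5, 0.5]),
    terminals=np.array([0.0, 0.0, 1.0]),
)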
Example #5
    def __init__(self, gae_lambda=1.0, clip_rewards=0.0, sample_episodes=False,
                 weight_entropy=None, memory_spec=None, **kwargs):
        """
        Args:
            gae_lambda (float): Lambda for generalized advantage estimation.
            clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a +/- `clip_rewards`
                range.
            sample_episodes (bool): If true, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If false, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        # Set policy to stochastic.
        if "policy_spec" in kwargs:
            policy_spec = kwargs.pop("policy_spec")
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(ActorCriticAgent, self).__init__(
            policy_spec=policy_spec,
            name=kwargs.pop("name", "actor-critic-agent"), **kwargs
        )
        self.sample_episodes = sample_episodes

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(dict(
            actions=self.action_space.with_batch_rank(),
            policy_weights="variables:{}".format(self.policy.scope),
            deterministic=bool,
            preprocessed_states=preprocessed_state_space,
            rewards=reward_space,
            terminals=terminal_space,
            sequence_indices=BoolBox(add_batch_rank=True)
        ))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(self.memory, RingBuffer), \
            "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")

        self.gae_function = GeneralizedAdvantageEstimation(gae_lambda=gae_lambda, discount=self.discount,
                                                           clip_rewards=clip_rewards)
        self.loss_function = ActorCriticLossFunction(weight_entropy=weight_entropy)

        # Add all our sub-components to the core.
        sub_components = [self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
                          self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
                          self.gae_function]
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"],
                              build_options=self.build_options)

            self.graph_built = True
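`weight_entropy` scales an entropy regularization term in `ActorCriticLossFunction`, which discourages the policy distribution from collapsing prematurely. A minimal sketch of such a term for a categorical policy, shown only to illustrate the idea (the hypothetical `entropy_bonus` helper is not part of RLgraph):

import numpy as np

def entropy_bonus(action_probs, weight_entropy=0.01):
    # H(pi) = -sum_a pi(a|s) * log pi(a|s), averaged over the batch.
    entropy = -np.sum(action_probs * np.log(action_probs + 1e-8), axis=-1)
    return weight_entropy * entropy.mean()

probs = np.array([[0.7, 0.2, 0.1],
                  [0.4, 0.4, 0.2]])
bonus = entropy_bonus(probs)  # added to the objective (or subtracted from the loss)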
Example #6
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 exploration_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="agent"):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.

            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.

            discount (float): The discount factor (gamma).

            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.

            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.

            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            value_function_spec (list): Neural network specification for baseline.

            exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.

            value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).

            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.

            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.

            name (str): Some name for this Agent object.
        """
        super(Agent, self).__init__()

        self.name = name
        self.auto_build = auto_build
        self.graph_built = False
        self.logger = logging.getLogger(__name__)

        self.state_space = Space.from_spec(state_space).with_batch_rank(False)
        self.flat_state_space = self.state_space.flatten() if isinstance(
            self.state_space, ContainerSpace) else None
        self.logger.info("Parsed state space definition: {}".format(
            self.state_space))
        self.action_space = Space.from_spec(action_space).with_batch_rank(
            False)
        self.flat_action_space = self.action_space.flatten() if isinstance(
            self.action_space, ContainerSpace) else None
        self.logger.info("Parsed action space definition: {}".format(
            self.action_space))

        self.discount = discount
        self.build_options = {}

        # The agent's root-Component.
        self.root_component = Component(name=self.name, nesting_level=0)

        # Define the input-Spaces:
        # Tag the input-Space to `self.set_weights` as equal to whatever the variables-Space will be for
        # the Agent's policy Component.
        self.input_spaces = dict(states=self.state_space.with_batch_rank(), )

        # Construct the Preprocessor.
        self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
        self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(
            self.state_space)
        self.preprocessing_required = preprocessing_spec is not None and len(
            preprocessing_spec) > 0
        if self.preprocessing_required:
            self.logger.info("Preprocessing required.")
            self.logger.info(
                "Parsed preprocessed-state space definition: {}".format(
                    self.preprocessed_state_space))
        else:
            self.logger.info("No preprocessing required.")

        # Construct the Policy network.
        policy_spec = policy_spec or dict()
        if "network_spec" not in policy_spec:
            policy_spec["network_spec"] = network_spec
        if "action_space" not in policy_spec:
            policy_spec["action_space"] = self.action_space
        self.policy_spec = policy_spec
        # The behavioral policy of the algorithm. Also the one that gets updated.
        self.policy = Policy.from_spec(self.policy_spec)
        # Done by default.
        self.policy.add_components(Synchronizable(), expose_apis="sync")

        # Create non-shared baseline network.
        self.value_function = None
        if value_function_spec is not None:
            self.value_function = ValueFunction(
                network_spec=value_function_spec)
            self.value_function.add_components(Synchronizable(),
                                               expose_apis="sync")
            self.vars_merger = ContainerMerger("policy",
                                               "vf",
                                               scope="variable-dict-merger")
            self.vars_splitter = ContainerSplitter(
                "policy", "vf", scope="variable-container-splitter")
        else:
            self.vars_merger = ContainerMerger("policy",
                                               scope="variable-dict-merger")
            self.vars_splitter = ContainerSplitter(
                "policy", scope="variable-container-splitter")

        self.internal_states_space = Space.from_spec(internal_states_space)

        # An object implementing the loss function interface is only strictly needed
        # if automatic device strategies like multi-gpu are enabled. This is because
        # the device strategy needs to know the name of the loss function to infer the appropriate
        # operations.
        self.loss_function = None

        self.exploration = Exploration.from_spec(exploration_spec)
        self.execution_spec = parse_execution_spec(execution_spec)

        # Python-side experience buffer for better performance (may be disabled).
        self.default_env = "env_0"

        def factory_(i):
            if i < 2:
                return []
            return tuple([[] for _ in range(i)])

        self.states_buffer = defaultdict(list)  # partial(factory_, len(self.flat_state_space))
        self.actions_buffer = defaultdict(
            partial(factory_, len(self.flat_action_space or [])))
        self.internals_buffer = defaultdict(list)
        self.rewards_buffer = defaultdict(list)
        self.next_states_buffer = defaultdict(list)  # partial(factory_, len(self.flat_state_space))
        self.terminals_buffer = defaultdict(list)

        self.observe_spec = parse_observe_spec(observe_spec)

        # Global time step counter.
        self.timesteps = 0

        # Create the Agent's optimizer based on optimizer_spec and execution strategy.
        self.optimizer = None
        if optimizer_spec is not None:
            # Save spec in case agent needs to create more optimizers e.g. for baseline.
            self.optimizer_spec = optimizer_spec
            self.optimizer = Optimizer.from_spec(optimizer_spec)

        self.value_function_optimizer = None
        if self.value_function is not None:
            if value_function_optimizer_spec is None:
                vf_optimizer_spec = self.optimizer_spec
            else:
                vf_optimizer_spec = value_function_optimizer_spec
            vf_optimizer_spec["scope"] = "value-function-optimizer"
            self.value_function_optimizer = Optimizer.from_spec(
                vf_optimizer_spec)

        # Update-spec dict tells the Agent how to update (e.g. memory batch size).
        self.update_spec = parse_update_spec(update_spec)

        # Create our GraphBuilder and -Executor.
        self.graph_builder = GraphBuilder(action_space=self.action_space,
                                          summary_spec=summary_spec)
        self.graph_executor = GraphExecutor.from_spec(
            get_backend(),
            graph_builder=self.graph_builder,
            execution_spec=self.execution_spec,
            saver_spec=saver_spec)  # type: GraphExecutor
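The Python-side experience buffers above are `defaultdict`s whose default factory depends on whether the corresponding space is a container. The short sketch below reproduces that pattern in isolation; the 3-component action space is a hypothetical example.

from collections import defaultdict
from functools import partial

def factory_(i):
    # Mirrors the buffer factory above: a single flat list for simple spaces,
    # or one list per flattened sub-space for container spaces with >= 2 components.
    if i < 2:
        return []
    return tuple([[] for _ in range(i)])

# Hypothetical container action space with 3 flattened components.
actions_buffer = defaultdict(partial(factory_, 3))
actions_buffer["env_0"]  # -> ([], [], []), created lazily on first access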
Example #7
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="ppo-agent",
                 clip_ratio=0.2,
                 gae_lambda=1.0,
                 clip_rewards=0.0,
                 value_function_clipping=None,
                 standardize_advantages=False,
                 sample_episodes=True,
                 weight_entropy=None,
                 memory_spec=None):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.

            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.

            discount (float): The discount factor (gamma).

            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.

            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.

            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.

            value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
                of ValueFunction.

            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.

            value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).

            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.

            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.

            name (str): Some name for this Agent object.
            clip_ratio (float): Clipping parameter for likelihood ratio.
            gae_lambda (float): Lambda for generalized advantage estimation.

            clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a +/- `clip_rewards`
                range.

            value_function_clipping (Optional[float]): If not None, uses clipped value function objective. If None,
                uses simple value function objective.

            standardize_advantages (bool): If true, standardize advantage values in update.

            sample_episodes (bool): If True, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.

            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).

            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        if policy_spec is not None:
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(PPOAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            value_function_spec=value_function_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            value_function_optimizer_spec=value_function_optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            name=name,
            auto_build=auto_build)
        self.sample_episodes = sample_episodes

        # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
        # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
        # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:policy",
                 value_function_weights="variables:value-function",
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True),
                 apply_postprocessing=bool))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(
            self.memory, RingBuffer
        ), "ERROR: PPO memory must be ring-buffer for episode-handling!"

        # Make sure the python buffer is not larger than our memory capacity.
        assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
            "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
            format(self.observe_spec["buffer_size"], self.memory.capacity)

        # The splitter for splitting up the records coming from the memory.
        self.standardize_advantages = standardize_advantages
        self.gae_function = GeneralizedAdvantageEstimation(
            gae_lambda=gae_lambda,
            discount=self.discount,
            clip_rewards=clip_rewards)
        self.loss_function = PPOLossFunction(
            clip_ratio=clip_ratio,
            value_function_clipping=value_function_clipping,
            weight_entropy=weight_entropy)

        self.iterations = self.update_spec["num_iterations"]
        self.sample_size = self.update_spec["sample_size"]
        self.batch_size = self.update_spec["batch_size"]

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.policy,
            self.exploration, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer,
            self.vars_merger, self.vars_splitter, self.gae_function)
        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph(
                [self.root_component],
                self.input_spaces,
                optimizer=self.optimizer,
                # Important: Use sample-size, not batch-size as the sub-samples (from a batch) are the ones that get
                # multi-gpu-split.
                batch_size=self.update_spec["sample_size"],
                build_options=self.build_options)
            self.graph_built = True
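`clip_ratio` parameterizes the usual PPO clipped surrogate objective computed by `PPOLossFunction`. A NumPy sketch of that objective, for illustration only (not the RLgraph implementation):

import numpy as np

def ppo_surrogate_loss(log_probs_new, log_probs_old, advantages, clip_ratio=0.2):
    ratio = np.exp(log_probs_new - log_probs_old)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * advantages
    # Take the pessimistic (minimum) objective per sample, negate to get a loss.
    return -np.mean(np.minimum(unclipped, clipped))

loss = ppo_surrogate_loss(
    log_probs_new=np.array([-0.9, -1.2]),
    log_probs_old=np.array([-1.0, -1.0]),
    advantages=np.array([1.0, -0.5]),
)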
Example #8
    def __init__(
        self,
        state_space,
        action_space,
        discount=0.98,
        preprocessing_spec=None,
        network_spec=None,
        internal_states_space=None,
        policy_spec=None,
        exploration_spec=None,
        execution_spec=None,
        optimizer_spec=None,
        observe_spec=None,
        update_spec=None,
        summary_spec=None,
        saver_spec=None,
        auto_build=True,
        name="dqfd-agent",
        expert_margin=0.5,
        supervised_weight=1.0,
        double_q=True,
        dueling_q=True,
        huber_loss=False,
        n_step=1,
        shared_container_action_target=False,
        memory_spec=None,
        demo_memory_spec=None,
        demo_sample_ratio=0.2,
    ):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.
            discount (float): The discount factor (gamma).
            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.
            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.
            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.
            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.
            name (str): Some name for this Agent object.
            expert_margin (float): The expert margin enforces a distance in Q-values between expert action and
                all other actions.
            supervised_weight (float): The weight of the supervised (expert) loss term.
            double_q (bool): Whether to use the double DQN loss function (see [2]).
            dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
            huber_loss (bool): Whether to apply a Huber loss (see [4]).
            n_step (Optional[int]): n-step adjustment to discounting.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
            demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
        """
        # Fix action-adapter before passing it to the super constructor.
        # Use a DuelingPolicy (instead of a basic Policy) if option is set.
        if dueling_q is True:
            if policy_spec is None:
                policy_spec = {}
            policy_spec["type"] = "dueling-policy"
            # Give us some default state-value nodes.
            if "units_state_value_stream" not in policy_spec:
                policy_spec["units_state_value_stream"] = 128
        super(DQFDAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            exploration_spec=exploration_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            auto_build=auto_build,
            name=name
        )
        # Assert that the sync interval is a multiple of the update_interval.
        if self.update_spec["sync_interval"] % self.update_spec["update_interval"] != 0:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be a multiple of update_interval "
                "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
            )
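        # Example: update_interval=4 with sync_interval=32 passes (32 % 4 == 0),
        # whereas sync_interval=30 would raise the error above (30 % 4 == 2).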

        self.double_q = double_q
        self.dueling_q = dueling_q
        self.huber_loss = huber_loss
        self.expert_margin = expert_margin

        self.batch_size = self.update_spec["batch_size"]
        self.default_margins = np.asarray([self.expert_margin] * self.batch_size)

        self.demo_batch_size = int(demo_sample_ratio * self.update_spec["batch_size"] / (1.0 - demo_sample_ratio))
        self.demo_margins = np.asarray([self.expert_margin] * self.demo_batch_size)
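        # Worked example (hypothetical settings): with batch_size=64 and demo_sample_ratio=0.2,
        # demo_batch_size = int(0.2 * 64 / 0.8) = 16, i.e. demo transitions make up 0.2 of the
        # combined 64 + 16 = 80 records used per update.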
        self.shared_container_action_target = shared_container_action_target

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        weight_space = FloatBox(add_batch_rank=True)

        self.input_spaces.update(dict(
            actions=self.action_space.with_batch_rank(),
            policy_weights="variables:{}".format(self.policy.scope),
            time_step=int,
            use_exploration=bool,
            demo_batch_size=int,
            apply_demo_loss=bool,
            preprocessed_states=preprocessed_state_space,
            rewards=reward_space,
            terminals=terminal_space,
            expert_margins=FloatBox(add_batch_rank=True),
            next_states=preprocessed_state_space,
            preprocessed_next_states=preprocessed_state_space,
            importance_weights=weight_space
        ))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")

        # The replay memory.
        self.memory = Memory.from_spec(memory_spec)
        # Give the demo memory its own scope so it does not clash with the main memory's default name.
        demo_memory_spec["scope"] = "demo-memory"
        self.demo_memory = Memory.from_spec(demo_memory_spec)

        # The splitter for splitting up the records from the memories.
        self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

        # Copy our Policy (target-net), make target-net synchronizable.
        self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
        # Number of steps since the last target-net synching from the main policy.
        self.steps_since_target_net_sync = 0

        self.use_importance_weights = isinstance(self.memory, PrioritizedReplay)
        self.loss_function = DQFDLossFunction(
            supervised_weight=supervised_weight,
            discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
            shared_container_action_target=shared_container_action_target,
            importance_weights=self.use_importance_weights, n_step=n_step
        )

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
            self.target_policy, self.exploration, self.loss_function, self.optimizer
        )

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()

        if self.auto_build:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"])
            self.graph_built = True
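A minimal instantiation sketch for the constructor above. The import paths, the space classes for the (hypothetical) spaces, and the memory spec keys/type names are assumptions, not values taken from this example:

from rlgraph.agents import DQFDAgent          # assumed import path
from rlgraph.spaces import FloatBox, IntBox   # assumed import path

agent = DQFDAgent(
    state_space=FloatBox(shape=(4,)),                             # hypothetical state Space
    action_space=IntBox(2),                                       # hypothetical action Space
    memory_spec=dict(type="prioritized_replay", capacity=10000),  # assumed spec keys/type name
    demo_memory_spec=dict(type="replay", capacity=10000),         # assumed spec keys/type name
    expert_margin=0.5,
    demo_sample_ratio=0.2,
)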
Example #9
    def __init__(self,
                 double_q=True,
                 dueling_q=True,
                 huber_loss=False,
                 n_step=1,
                 shared_container_action_target=True,
                 memory_spec=None,
                 store_last_memory_batch=False,
                 store_last_q_table=False,
                 **kwargs):
        """
        Args:
            double_q (bool): Whether to use the double DQN loss function (see [2]).
            dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
            huber_loss (bool): Whether to apply a Huber loss (see [4]).
            n_step (Optional[int]): n-step adjustment to discounting.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
            store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
                `self.last_memory_batch` for debugging purposes.
                Default: False.
            store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
                (memory or external) in `self.last_q_table` for debugging purposes.
                Default: False.
        """
        # Fix action-adapter before passing it to the super constructor.
        policy_spec = kwargs.pop("policy_spec", dict())
        # Use a DuelingPolicy (instead of a basic Policy) if option is set.
        if dueling_q is True:
            policy_spec["type"] = "dueling-policy"
            # Give us some default state-value nodes.
            if "units_state_value_stream" not in policy_spec:
                policy_spec["units_state_value_stream"] = 128

        super(DQNAgent, self).__init__(policy_spec=policy_spec,
                                       name=kwargs.pop("name", "dqn-agent"),
                                       **kwargs)

        # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
        # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
        #self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

        # Assert that the sync interval is a multiple of the update_interval.
        if self.update_spec["sync_interval"] % self.update_spec["update_interval"] != 0:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be a multiple of update_interval "
                "({})!".format(self.update_spec["sync_interval"],
                               self.update_spec["update_interval"]))

        self.double_q = double_q
        self.dueling_q = dueling_q
        self.huber_loss = huber_loss
        self.shared_container_action_target = shared_container_action_target

        # Debugging tools.
        self.store_last_memory_batch = store_last_memory_batch
        self.last_memory_batch = None
        self.store_last_q_table = store_last_q_table
        self.last_q_table = None

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        weight_space = FloatBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(
                actions=self.action_space.with_batch_rank(),
                # weights will have a Space derived from the vars of policy.
                policy_weights="variables:{}".format(self.policy.scope),
                time_step=int,
                use_exploration=bool,
                preprocessed_states=preprocessed_state_space,
                rewards=reward_space,
                terminals=terminal_space,
                next_states=preprocessed_state_space,
                preprocessed_next_states=preprocessed_state_space,
                importance_weights=weight_space,
                apply_postprocessing=bool))
        if self.value_function is not None:
            self.input_spaces["value_function_weights"] = "variables:{}".format(
                self.value_function.scope)

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "next_states", "terminals")
        # The replay memory.
        self.memory = Memory.from_spec(memory_spec)
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals", "next_states")

        # Make sure the python buffer is not larger than our memory capacity.
        assert self.observe_spec["buffer_size"] <= self.memory.capacity,\
            "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!".\
            format(self.observe_spec["buffer_size"], self.memory.capacity)

        # Copy our Policy (target-net), make target-net synchronizable.
        self.target_policy = self.policy.copy(scope="target-policy",
                                              trainable=False)
        # Number of steps since the last target-net synching from the main policy.
        self.steps_since_target_net_sync = 0

        use_importance_weights = isinstance(self.memory, PrioritizedReplay)
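        # (Importance-sampling corrections are only applied when the memory samples
        # non-uniformly, i.e. when it is a PrioritizedReplay.)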
        self.loss_function = DQNLossFunction(
            discount=self.discount,
            double_q=self.double_q,
            huber_loss=self.huber_loss,
            shared_container_action_target=shared_container_action_target,
            importance_weights=use_importance_weights,
            n_step=n_step)

        self.root_component.add_components(
            self.preprocessor,
            self.merger,
            self.memory,
            self.splitter,
            self.policy,
            self.target_policy,
            self.value_function,
            self.value_function_optimizer,  # <- should both be None for DQN
            self.exploration,
            self.loss_function,
            self.optimizer,
            self.vars_merger,
            self.vars_splitter)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()

        # markup = get_graph_markup(self.graph_builder.root_component)
        # print(markup)
        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"])
            self.graph_built = True
Example #10
    def __init__(self,
                 clip_ratio=0.2,
                 gae_lambda=1.0,
                 clip_rewards=0.0,
                 standardize_advantages=False,
                 sample_episodes=True,
                 weight_entropy=None,
                 memory_spec=None,
                 **kwargs):
        """
        Args:
            clip_ratio (float): Clipping parameter for likelihood ratio.
            gae_lambda (float): Lambda for generalized advantage estimation.
            clip_rewards (float): Reward clip value. If not 0, rewards are clipped to
                [-clip_rewards, clip_rewards].
            standardize_advantages (bool): If true, standardize advantage values in update.

            sample_episodes (bool): If True, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.

            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).

            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        if "policy_spec" in kwargs:
            policy_spec = kwargs.pop("policy_spec")
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(PPOAgent, self).__init__(
            policy_spec=policy_spec,  # Set policy to stochastic.
            name=kwargs.pop("name", "ppo-agent"),
            **kwargs)
        self.sample_episodes = sample_episodes

        # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
        # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
        # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:policy",
                 value_function_weights="variables:value-function",
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True),
                 apply_postprocessing=bool))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(self.memory, RingBuffer), \
            "ERROR: PPO memory must be ring-buffer for episode-handling!"

        # Make sure the python buffer is not larger than our memory capacity.
        assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
            "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
                format(self.observe_spec["buffer_size"], self.memory.capacity)

        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals")
        self.gae_function = GeneralizedAdvantageEstimation(
            gae_lambda=gae_lambda,
            discount=self.discount,
            clip_rewards=clip_rewards)
        self.loss_function = PPOLossFunction(
            clip_ratio=clip_ratio,
            standardize_advantages=standardize_advantages,
            weight_entropy=weight_entropy)

        self.iterations = self.update_spec["num_iterations"]
        self.sample_size = self.update_spec["sample_size"]
        self.batch_size = self.update_spec["batch_size"]

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.splitter,
            self.policy, self.exploration, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer,
            self.vars_merger, self.vars_splitter, self.gae_function)
        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph(
                [self.root_component],
                self.input_spaces,
                optimizer=self.optimizer,
                # Important: Use sample-size, not batch-size as the sub-samples (from a batch) are the ones that get
                # multi-gpu-split.
                batch_size=self.update_spec["sample_size"],
                build_options=self.build_options)
            self.graph_built = True
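A minimal instantiation sketch for the PPO constructor above. Import paths, space classes and the ring-buffer spec values are assumptions; the only hard requirement visible in the code is that `memory_spec` resolves to a RingBuffer:

from rlgraph.agents import PPOAgent           # assumed import path
from rlgraph.spaces import FloatBox, IntBox   # assumed import path

agent = PPOAgent(
    state_space=FloatBox(shape=(4,)),                     # hypothetical state Space
    action_space=IntBox(2),                               # hypothetical action Space
    memory_spec=dict(type="ring_buffer", capacity=1000),  # assumed spec keys/type name; must yield a RingBuffer
    gae_lambda=0.95,
    clip_ratio=0.2,
    standardize_advantages=True,
)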
Example #11
    def __init__(
        self,
        state_space,
        action_space,
        discount=0.98,
        preprocessing_spec=None,
        network_spec=None,
        internal_states_space=None,
        policy_spec=None,
        exploration_spec=None,
        execution_spec=None,
        optimizer_spec=None,
        observe_spec=None,
        update_spec=None,
        summary_spec=None,
        saver_spec=None,
        auto_build=True,
        name="dqn-agent",
        double_q=True,
        dueling_q=True,
        huber_loss=False,
        n_step=1,
        shared_container_action_target=True,
        memory_spec=None,
        store_last_memory_batch=False,
        store_last_q_table=False,
    ):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.
            discount (float): The discount factor (gamma).
            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.
            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.
            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.
            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.
            name (str): Some name for this Agent object.
            double_q (bool): Whether to use the double DQN loss function (see [2]).
            dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
            huber_loss (bool): Whether to apply a Huber loss (see [4]).
            n_step (Optional[int]): n-step adjustment to discounting.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
            store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
                `self.last_memory_batch` for debugging purposes.
                Default: False.
            store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
                (memory or external) in `self.last_q_table` for debugging purposes.
                Default: False.
        """
        # Fix action-adapter before passing it to the super constructor.
        # Use a DuelingPolicy (instead of a basic Policy) if option is set.
        if dueling_q is True:
            if policy_spec is None:
                policy_spec = {}
            policy_spec["type"] = "dueling-policy"
            # Give us some default state-value nodes.
            if "units_state_value_stream" not in policy_spec:
                policy_spec["units_state_value_stream"] = 128

        super(DQNAgent,
              self).__init__(state_space=state_space,
                             action_space=action_space,
                             discount=discount,
                             preprocessing_spec=preprocessing_spec,
                             network_spec=network_spec,
                             internal_states_space=internal_states_space,
                             policy_spec=policy_spec,
                             exploration_spec=exploration_spec,
                             execution_spec=execution_spec,
                             optimizer_spec=optimizer_spec,
                             observe_spec=observe_spec,
                             update_spec=update_spec,
                             summary_spec=summary_spec,
                             saver_spec=saver_spec,
                             auto_build=auto_build,
                             name=name)

        # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
        # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
        # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

        # Assert that the sync interval is a multiple of the update_interval.
        if self.update_spec["sync_interval"] % self.update_spec["update_interval"] != 0:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be a multiple of update_interval "
                "({})!".format(self.update_spec["sync_interval"],
                               self.update_spec["update_interval"]))

        self.double_q = double_q
        self.dueling_q = dueling_q
        self.huber_loss = huber_loss
        self.shared_container_action_target = shared_container_action_target

        # Debugging tools.
        self.store_last_memory_batch = store_last_memory_batch
        self.last_memory_batch = None
        self.store_last_q_table = store_last_q_table
        self.last_q_table = None

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        weight_space = FloatBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(
                actions=self.action_space.with_batch_rank(),
                # Weights will have a Space derived from the vars of policy.
                policy_weights="variables:{}".format(self.policy.scope),
                time_step=int,
                use_exploration=bool,
                preprocessed_states=preprocessed_state_space,
                rewards=reward_space,
                terminals=terminal_space,
                next_states=preprocessed_state_space,
                preprocessed_next_states=preprocessed_state_space,
                importance_weights=weight_space,
                apply_postprocessing=bool))
        if self.value_function is not None:
            self.input_spaces["value_function_weights"] = "variables:{}".format(
                self.value_function.scope)

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "next_states", "terminals")
        # The replay memory.
        self.memory = Memory.from_spec(memory_spec)
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals", "next_states")

        # Make sure the python buffer is not larger than our memory capacity.
        assert self.observe_spec["buffer_size"] <= self.memory.capacity,\
            "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!".\
            format(self.observe_spec["buffer_size"], self.memory.capacity)

        # Copy our Policy (target-net), make target-net synchronizable.
        self.target_policy = self.policy.copy(scope="target-policy",
                                              trainable=False)
        # Number of steps since the last target-net synching from the main policy.
        self.steps_since_target_net_sync = 0

        use_importance_weights = isinstance(self.memory, PrioritizedReplay)
        self.loss_function = DQNLossFunction(
            discount=self.discount,
            double_q=self.double_q,
            huber_loss=self.huber_loss,
            shared_container_action_target=shared_container_action_target,
            importance_weights=use_importance_weights,
            n_step=n_step)

        self.root_component.add_components(
            self.preprocessor,
            self.merger,
            self.memory,
            self.splitter,
            self.policy,
            self.target_policy,
            self.value_function,
            self.value_function_optimizer,  # <- should both be None for DQN
            self.exploration,
            self.loss_function,
            self.optimizer,
            self.vars_merger,
            self.vars_splitter)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()

        # markup = get_graph_markup(self.graph_builder.root_component)
        # print(markup)
        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"])
            self.graph_built = True