Example #1
    def _create_action_adapters_and_distributions(self, action_space, action_adapter_spec):
        if action_space is None:
            adapter = ActionAdapter.from_spec(action_adapter_spec)
            self.action_space = adapter.action_space
            # Assert single component action space.
            assert len(self.action_space.flatten()) == 1,\
                "ERROR: Action space must not be ContainerSpace if no `action_space` is given in Policy c'tor!"
        else:
            self.action_space = Space.from_spec(action_space)

        # Figure out our Distributions.
        for i, (flat_key, action_component) in enumerate(self.action_space.flatten().items()):
            distribution = self.distributions[flat_key] = self._get_distribution(i, action_component)
            if distribution is None:
                raise RLGraphError("ERROR: `action_component` is of type {} and not allowed in {} Component!".
                                   format(type(action_space).__name__, self.name))
            action_adapter_type = distribution.get_action_adapter_type()
            # Spec dict.
            if isinstance(action_adapter_spec, dict):
                aa_spec = action_adapter_spec.get(flat_key, action_adapter_spec)
                aa_spec["type"] = action_adapter_type
                aa_spec["action_space"] = action_component
            # Simple type spec.
            elif not isinstance(action_adapter_spec, ActionAdapter):
                aa_spec = dict(type=action_adapter_type, action_space=action_component)
            # Direct object.
            else:
                aa_spec = action_adapter_spec
            self.action_adapters[flat_key] = ActionAdapter.from_spec(aa_spec, scope="action-adapter-{}".format(i))
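The branching above (per-key spec dict vs. plain type spec vs. already constructed ActionAdapter) is easier to see in isolation. Below is a minimal, self-contained sketch of that dispatch using plain dictionaries; `resolve_adapter_spec`, `DummyAdapter` and the sample values are hypothetical stand-ins for illustration, not part of the RLGraph API (note also that the sketch copies the spec dict, whereas the method above fills it in in place).

class DummyAdapter(object):
    """Hypothetical stand-in for an already constructed ActionAdapter object."""
    pass


def resolve_adapter_spec(adapter_spec, flat_key, action_component, adapter_type):
    # Case 1: a (possibly per-flat-key) spec dict -> look up this key's sub-spec,
    # then fill in the adapter type and the action component.
    if isinstance(adapter_spec, dict):
        aa_spec = dict(adapter_spec.get(flat_key, adapter_spec))
        aa_spec["type"] = adapter_type
        aa_spec["action_space"] = action_component
        return aa_spec
    # Case 2: a simple type spec (e.g. a string) -> build a fresh spec dict.
    elif not isinstance(adapter_spec, DummyAdapter):
        return dict(type=adapter_type, action_space=action_component)
    # Case 3: an already constructed adapter object -> pass it through as-is.
    return adapter_spec


# One shared spec dict applied to a single-component action space.
print(resolve_adapter_spec({"units": 64}, "/", "IntBox(4)", "categorical-action-adapter"))
# -> {'units': 64, 'type': 'categorical-action-adapter', 'action_space': 'IntBox(4)'}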
Example #2
    def __init__(self, state_space, action_space, seed=None):
        """
        Args:
            state_space (Union[dict,Space]): The spec-dict for generating the state Space or the state Space object
                itself.
            action_space (Union[dict,Space]): The spec-dict for generating the action Space or the action Space object
                itself.
            #reward_clipping (Optional[Union[Tuple[float,float],float]]): An optional reward clipping setting used
            #    to restrict all rewards produced by the Environment to be in a certain range.
            #    None for no clipping. Single float for clipping between -`reward_clipping` and +`reward_clipping`.
            seed (Optional[int]): An optional seed value passed to `self.seed()` to seed the Environment's
                random number generator(s). None for no explicit seeding.
        """
        super(Environment, self).__init__()

        self.state_space = Space.from_spec(state_space)
        self.action_space = Space.from_spec(action_space)
        # self.reward_clipping = reward_clipping

        # Add some seeding to the created Env.
        if seed is not None:
            self.seed(seed)
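Both space arguments above go through `Space.from_spec`, i.e. each may be either a spec dict or an already built Space object. The snippet below is a minimal standalone sketch of that "spec or object" pattern; `FromSpec` and `ToySpace` are simplified, hypothetical classes, not the RLGraph `Specifiable`/`Space` implementation.

class FromSpec(object):
    """Hypothetical, simplified version of the spec-or-object pattern used above."""

    @classmethod
    def from_spec(cls, spec, **kwargs):
        # Already an instance of this class? Pass it through unchanged.
        if isinstance(spec, cls):
            return spec
        # Otherwise interpret the spec as constructor kwargs.
        if isinstance(spec, dict):
            kwargs.update(spec)
        return cls(**kwargs)


class ToySpace(FromSpec):
    def __init__(self, shape=(), dtype="float32"):
        self.shape = tuple(shape)
        self.dtype = dtype


# Both calls yield a ToySpace instance, mirroring `Space.from_spec(state_space)` above.
s1 = ToySpace.from_spec(dict(shape=(4,), dtype="float32"))
s2 = ToySpace.from_spec(ToySpace(shape=(4,)))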
Example #3
    def _create_action_adapters_and_distributions(self, action_space,
                                                  action_adapter_spec):
        if action_space is None:
            adapter = ActionAdapter.from_spec(action_adapter_spec)
            self.action_space = adapter.action_space
            # Assert single component action space.
            assert len(self.action_space.flatten()) == 1, \
                "ERROR: Action space must not be ContainerSpace if no `action_space` is given in Policy constructor!"
        else:
            self.action_space = Space.from_spec(action_space)

        # Figure out our Distributions.
        for i, (flat_key, action_component) in enumerate(
                self.action_space.flatten().items()):
            # Spec dict.
            if isinstance(action_adapter_spec, dict):
                aa_spec = flat_key_lookup(action_adapter_spec, flat_key,
                                          action_adapter_spec)
                aa_spec["action_space"] = action_component
            # Simple type spec.
            elif not isinstance(action_adapter_spec, ActionAdapter):
                aa_spec = dict(action_space=action_component)
            # Direct object.
            else:
                aa_spec = action_adapter_spec

            if isinstance(aa_spec, dict) and "type" not in aa_spec:
                dist_spec = get_default_distribution_from_space(
                    action_component, self.bounded_distribution_type,
                    self.discrete_distribution_type,
                    self.gumbel_softmax_temperature)

                if dist_spec is None:
                    raise RLGraphError(
                        "ERROR: `action_component` is of type {} and not allowed in {} Component!"
                        .format(type(action_component).__name__, self.name))
                self.distributions[flat_key] = Distribution.from_spec(
                    dist_spec, scope="{}-{}".format(dist_spec["type"], i))
                aa_spec["type"] = get_action_adapter_type_from_distribution_type(
                    type(self.distributions[flat_key]).__name__)
                self.action_adapters[flat_key] = ActionAdapter.from_spec(
                    aa_spec, scope="action-adapter-{}".format(i))
            else:
                self.action_adapters[flat_key] = ActionAdapter.from_spec(
                    aa_spec, scope="action-adapter-{}".format(i))
                dist_spec = get_distribution_spec_from_action_adapter(
                    self.action_adapters[flat_key])
                self.distributions[flat_key] = Distribution.from_spec(
                    dist_spec, scope="{}-{}".format(dist_spec["type"], i))
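Compared to Example #1, the inference direction here depends on the spec: without an explicit "type" in the adapter spec, a default Distribution is picked from the action component and the adapter type is derived from that Distribution; with an explicit adapter spec, the Distribution is derived from the constructed adapter instead. The sketch below is a hypothetical, simplified space-to-distribution lookup that mirrors only the structure of `get_default_distribution_from_space` (the real RLGraph mapping rules may differ).

def pick_default_distribution(component_type, is_bounded, bounded_type="beta",
                              discrete_type="categorical", gumbel_temperature=1.0):
    # Hypothetical lookup: maps a (roughly described) action component to a
    # distribution spec dict, or None if the component type is not supported.
    if component_type == "int":
        if discrete_type == "gumbel-softmax":
            return dict(type="gumbel-softmax", temperature=gumbel_temperature)
        return dict(type=discrete_type)
    elif component_type == "bool":
        return dict(type="bernoulli")
    elif component_type == "float":
        return dict(type=bounded_type if is_bounded else "normal")
    # Unsupported component type -> caller raises (cf. the RLGraphError above).
    return None


print(pick_default_distribution("float", is_bounded=True))    # {'type': 'beta'}
print(pick_default_distribution("int", is_bounded=False))     # {'type': 'categorical'}
print(pick_default_distribution("text", is_bounded=False))    # None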
Example #4
    def __init__(self,
                 environment_spec,
                 actor_component_spec,
                 num_steps=20,
                 state_space=None,
                 reward_space=None,
                 internal_states_space=None,
                 add_action_probs=False,
                 action_probs_space=None,
                 add_action=False,
                 add_reward=False,
                 add_previous_action_to_state=False,
                 add_previous_reward_to_state=False,
                 scope="environment-stepper",
                 **kwargs):
        """
        Args:
            environment_spec (dict): A specification dict for constructing an Environment object that will be run
                inside a SpecifiableServer for in-graph stepping.
            actor_component_spec (Union[ActorComponent,dict]): A specification dict to construct this
                EnvironmentStepper's ActorComponent (to generate actions) or an already constructed ActorComponent
                object.
            num_steps (int): The number of steps to perform per `step` call.
            state_space (Optional[Space]): The state Space of the Environment. If None, will construct a dummy
                environment to get the state Space from there.
            reward_space (Optional[Space]): The reward Space of the Environment. If None, will construct a dummy
                environment to get the reward Space from there.
            internal_states_space (Optional[Space]): The internal states Space (when using an RNN inside the
                ActorComponent).
            add_action_probs (bool): Whether to add all action probabilities for each step to the ActorComponent's
                outputs at each step. These will be added as an additional tensor to the output of the `step`
                API-method. Default: False.
            action_probs_space (Optional[Space]): If add_action_probs is True, the Space that the action_probs will have.
                This is usually just the flattened (one-hot) action space.
            add_action (bool): Whether to add the action to the output of the `step` API-method.
                Default: False.
            add_reward (bool): Whether to add the reward to the output of the `step` API-method.
                Default: False.
            add_previous_action_to_state (bool): Whether to add the previous action as another input channel to the
                ActorComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: False.
            add_previous_reward_to_state (bool): Whether to add the previous reward as another input channel to the
                ActorComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: False.
        """
        super(EnvironmentStepper, self).__init__(scope=scope, **kwargs)

        # Only to retrieve some information about the particular Env.
        dummy_env = Environment.from_spec(
            environment_spec)  # type: Environment

        # Determine the state and reward Spaces needed for the SpecifiableServer
        # (via the dummy env if they were not given explicitly).
        if state_space is None:
            state_space = dummy_env.state_space
        if reward_space is None:
            _, reward, _, _ = dummy_env.step(
                dummy_env.action_space.sample())
            # TODO: this may break on non 64-bit machines. tf seems to interpret a python float as tf.float64.
            reward_space = Space.from_spec(
                "float64" if type(reward) == float else float,
                shape=(1, )).with_batch_rank()
        else:
            reward_space = Space.from_spec(reward_space).with_batch_rank()

        self.reward_space = reward_space
        self.action_space = dummy_env.action_space

        dummy_env.terminate()

        # The state that the environment produces.
        self.state_space_env = state_space
        # The state that must be fed into the actor-component to produce an action.
        # May contain prev_action and prev_reward.
        self.state_space_actor = state_space
        self.add_previous_action_to_state = add_previous_action_to_state
        self.add_previous_reward_to_state = add_previous_reward_to_state

        # Return actions and/or rewards from the `step` API-method?
        self.add_action = add_action
        self.add_reward = add_reward

        # The problem with ContainerSpaces here is that py_func (SpecifiableServer) cannot handle container
        # spaces, which is why we need to painfully convert these into flat spaces and tuples here whenever
        # we make a call to the env. So to keep things unified, we treat all container spaces
        # (state space, preprocessed state) from here on as tuples of primitive spaces, sorted by their
        # would-be flat-keys in a flattened dict.
        self.state_space_env_flattened = self.state_space_env.flatten()
        # Need to flatten the state-space in case it's a ContainerSpace for the return dtypes.
        self.state_space_env_list = list(
            self.state_space_env_flattened.values())

        # TODO: automate this by lookup from the NN Component
        self.internal_states_space = None
        if internal_states_space is not None:
            self.internal_states_space = internal_states_space.with_batch_rank(
                add_batch_rank=1)

        # Add the action/reward spaces to the state space (must be Dict).
        if self.add_previous_action_to_state is True:
            assert isinstance(self.state_space_actor, Dict),\
                "ERROR: If `add_previous_action_to_state` is True as input, state_space must be a Dict!"
            self.state_space_actor["previous_action"] = self.action_space
        if self.add_previous_reward_to_state is True:
            assert isinstance(self.state_space_actor, Dict),\
                "ERROR: If `add_previous_reward_to_state` is True as input, state_space must be a Dict!"
            self.state_space_actor["previous_reward"] = self.reward_space
        self.state_space_actor_flattened = self.state_space_actor.flatten()
        self.state_space_actor_list = list(
            self.state_space_actor_flattened.values())

        self.add_action_probs = add_action_probs
        self.action_probs_space = action_probs_space

        self.environment_spec = environment_spec
        self.environment_server = SpecifiableServer(
            class_=Environment,
            spec=environment_spec,
            output_spaces=dict(
                step_for_env_stepper=self.state_space_env_list +
                [self.reward_space, bool],
                reset_for_env_stepper=self.state_space_env_list),
            shutdown_method="terminate")
        # Add the sub-components.
        self.actor_component = ActorComponent.from_spec(
            actor_component_spec)  # type: ActorComponent
        self.preprocessed_state_space = self.actor_component.preprocessor.get_preprocessed_space(
            self.state_space_actor)

        self.num_steps = num_steps

        # Variables that hold information of last step through Env.
        self.current_terminal = None
        self.current_state = None
        self.current_action = None  # Only if self.add_action is True.
        self.current_reward = None  # Only if self.add_reward is True.
        self.current_internal_states = None
        self.current_action_probs = None
        self.time_step = 0

        self.has_rnn = self.actor_component.policy.neural_network.has_rnn()

        # Add all sub-components (only ActorComponent).
        self.add_components(self.actor_component)
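The central design constraint in this constructor is the one described in the comment above: py_func (SpecifiableServer) cannot pass container spaces, so every container space is carried around as a flat, key-sorted tuple of primitive spaces. The snippet below is a standalone sketch of that flattening convention using plain nested dicts in place of RLGraph Spaces; the `/`-separated flat keys and the dummy space strings are assumptions for illustration only.

from collections import OrderedDict


def flatten_space(space, parent_key="", sep="/"):
    # Hypothetical flattener: nested dicts of "spaces" -> OrderedDict keyed by flat keys.
    items = []
    if isinstance(space, dict):
        for key in sorted(space):
            items.extend(flatten_space(space[key], parent_key + sep + key, sep).items())
    else:
        items.append((parent_key or sep, space))
    return OrderedDict(items)


# A Dict-like state space with one nested component.
state_space = dict(camera="float(84,84,3)", status=dict(ammo="int()", health="float()"))
flat = flatten_space(state_space)
state_list = list(flat.values())  # cf. self.state_space_env_list above

print(list(flat.keys()))  # ['/camera', '/status/ammo', '/status/health']
print(state_list)         # ['float(84,84,3)', 'int()', 'float()']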
Example #5
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 exploration_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="agent"):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.

            discount (float): The discount factor (gamma).

            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary state
                preprocessing steps or a PreprocessorStack object itself.

            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.

            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.

            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            value_function_spec (Optional[list]): Neural network specification for the baseline (value function).

            exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.

            value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).

            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.

            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.

            name (str): Some name for this Agent object.
        """
        super(Agent, self).__init__()

        self.name = name
        self.auto_build = auto_build
        self.graph_built = False
        self.logger = logging.getLogger(__name__)

        self.state_space = Space.from_spec(state_space).with_batch_rank(False)
        self.flat_state_space = self.state_space.flatten() if isinstance(
            self.state_space, ContainerSpace) else None
        self.logger.info("Parsed state space definition: {}".format(
            self.state_space))
        self.action_space = Space.from_spec(action_space).with_batch_rank(
            False)
        self.flat_action_space = self.action_space.flatten() if isinstance(
            self.action_space, ContainerSpace) else None
        self.logger.info("Parsed action space definition: {}".format(
            self.action_space))

        self.discount = discount
        self.build_options = {}

        # The agent's root-Component.
        self.root_component = Component(name=self.name, nesting_level=0)

        # Define the input-Spaces:
        # Tag the input-Space to `self.set_weights` as equal to whatever the variables-Space will be for
        # the Agent's policy Component.
        self.input_spaces = dict(states=self.state_space.with_batch_rank(), )

        # Construct the Preprocessor.
        self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
        self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(
            self.state_space)
        self.preprocessing_required = preprocessing_spec is not None and len(
            preprocessing_spec) > 0
        if self.preprocessing_required:
            self.logger.info("Preprocessing required.")
            self.logger.info(
                "Parsed preprocessed-state space definition: {}".format(
                    self.preprocessed_state_space))
        else:
            self.logger.info("No preprocessing required.")

        # Construct the Policy network.
        policy_spec = policy_spec or dict()
        if "network_spec" not in policy_spec:
            policy_spec["network_spec"] = network_spec
        if "action_space" not in policy_spec:
            policy_spec["action_space"] = self.action_space
        self.policy_spec = policy_spec
        # The behavioral policy of the algorithm. Also the one that gets updated.
        self.policy = Policy.from_spec(self.policy_spec)
        # Done by default.
        self.policy.add_components(Synchronizable(), expose_apis="sync")

        # Create non-shared baseline network.
        self.value_function = None
        if value_function_spec is not None:
            self.value_function = ValueFunction(
                network_spec=value_function_spec)
            self.value_function.add_components(Synchronizable(),
                                               expose_apis="sync")
            self.vars_merger = ContainerMerger("policy",
                                               "vf",
                                               scope="variable-dict-merger")
            self.vars_splitter = ContainerSplitter(
                "policy", "vf", scope="variable-container-splitter")
        else:
            self.vars_merger = ContainerMerger("policy",
                                               scope="variable-dict-merger")
            self.vars_splitter = ContainerSplitter(
                "policy", scope="variable-container-splitter")

        self.internal_states_space = Space.from_spec(internal_states_space)

        # An object implementing the loss function interface is only strictly needed
        # if automatic device strategies like multi-gpu are enabled. This is because
        # the device strategy needs to know the name of the loss function to infer the appropriate
        # operations.
        self.loss_function = None

        self.exploration = Exploration.from_spec(exploration_spec)
        self.execution_spec = parse_execution_spec(execution_spec)

        # Python-side experience buffer for better performance (may be disabled).
        self.default_env = "env_0"

        def factory_(i):
            # Primitive action space (or none given) -> one plain list per env-id;
            # container action space -> one list per flattened action component.
            if i < 2:
                return []
            return tuple([[] for _ in range(i)])

        self.states_buffer = defaultdict(
            list)  # partial(factory_, len(self.flat_state_space))
        self.actions_buffer = defaultdict(
            partial(factory_, len(self.flat_action_space or [])))
        self.internals_buffer = defaultdict(list)
        self.rewards_buffer = defaultdict(list)
        self.next_states_buffer = defaultdict(
            list)  # partial(factory_, len(self.flat_state_space))
        self.terminals_buffer = defaultdict(list)

        self.observe_spec = parse_observe_spec(observe_spec)

        # Global time step counter.
        self.timesteps = 0

        # Create the Agent's optimizer based on optimizer_spec and execution strategy.
        self.optimizer = None
        self.optimizer_spec = None
        if optimizer_spec is not None:
            # Save spec in case agent needs to create more optimizers e.g. for baseline.
            self.optimizer_spec = optimizer_spec
            self.optimizer = Optimizer.from_spec(optimizer_spec)

        self.value_function_optimizer = None
        if self.value_function is not None:
            if value_function_optimizer_spec is None:
                vf_optimizer_spec = self.optimizer_spec
            else:
                vf_optimizer_spec = value_function_optimizer_spec
            vf_optimizer_spec["scope"] = "value-function-optimizer"
            self.value_function_optimizer = Optimizer.from_spec(
                vf_optimizer_spec)

        # Update-spec dict tells the Agent how to update (e.g. memory batch size).
        self.update_spec = parse_update_spec(update_spec)

        # Create our GraphBuilder and -Executor.
        self.graph_builder = GraphBuilder(action_space=self.action_space,
                                          summary_spec=summary_spec)
        self.graph_executor = GraphExecutor.from_spec(
            get_backend(),
            graph_builder=self.graph_builder,
            execution_spec=self.execution_spec,
            saver_spec=saver_spec)  # type: GraphExecutor
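One subtle detail above is the shape of the per-environment Python buffers: for a primitive action space (`self.flat_action_space` is None, so `len([]) == 0`) each env-id maps to a single list, while for a container action space each env-id maps to a tuple with one list per flattened action component. Below is a standalone sketch of that behavior (the env-ids and component counts are made up).

from collections import defaultdict
from functools import partial


def factory_(i):
    # Mirrors factory_ above: one plain list for fewer than 2 components,
    # otherwise a tuple with one list per flattened component.
    if i < 2:
        return []
    return tuple([[] for _ in range(i)])


# Primitive action space -> a plain list of actions per env-id.
actions_buffer = defaultdict(partial(factory_, 0))
actions_buffer["env_0"].append(3)

# Container action space with 2 flattened components -> a tuple of two lists per env-id.
container_buffer = defaultdict(partial(factory_, 2))
container_buffer["env_0"][0].append(0.5)  # first action component
container_buffer["env_0"][1].append(2)    # second action component

print(actions_buffer["env_0"])    # [3]
print(container_buffer["env_0"])  # ([0.5], [2])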