def _create_action_adapters_and_distributions(self, action_space, action_adapter_spec):
    if action_space is None:
        adapter = ActionAdapter.from_spec(action_adapter_spec)
        self.action_space = adapter.action_space
        # Assert single component action space.
        assert len(self.action_space.flatten()) == 1, \
            "ERROR: Action space must not be ContainerSpace if no `action_space` is given in Policy c'tor!"
    else:
        self.action_space = Space.from_spec(action_space)

    # Figure out our Distributions.
    for i, (flat_key, action_component) in enumerate(self.action_space.flatten().items()):
        distribution = self.distributions[flat_key] = self._get_distribution(i, action_component)
        if distribution is None:
            raise RLGraphError("ERROR: `action_component` is of type {} and not allowed in {} Component!".format(
                type(action_component).__name__, self.name
            ))
        action_adapter_type = distribution.get_action_adapter_type()
        # Spec dict.
        if isinstance(action_adapter_spec, dict):
            aa_spec = action_adapter_spec.get(flat_key, action_adapter_spec)
            aa_spec["type"] = action_adapter_type
            aa_spec["action_space"] = action_component
        # Simple type spec.
        elif not isinstance(action_adapter_spec, ActionAdapter):
            aa_spec = dict(type=action_adapter_type, action_space=action_component)
        # Direct object.
        else:
            aa_spec = action_adapter_spec
        self.action_adapters[flat_key] = ActionAdapter.from_spec(aa_spec, scope="action-adapter-{}".format(i))
def __init__(self, state_space, action_space, seed=None):
    """
    Args:
        state_space (Union[dict,Space]): The spec-dict for generating the state Space or the state Space
            object itself.
        action_space (Union[dict,Space]): The spec-dict for generating the action Space or the action Space
            object itself.
        seed (Optional[int]): An optional random seed to initialize this Environment with.
        #reward_clipping (Optional[Union[Tuple[float,float],float]]): An optional reward clipping setting used
        #    to restrict all rewards produced by the Environment to be in a certain range. None for no clipping.
        #    Single float for clipping between -`reward_clipping` and +`reward_clipping`.
    """
    super(Environment, self).__init__()
    self.state_space = Space.from_spec(state_space)
    self.action_space = Space.from_spec(action_space)
    # self.reward_clipping = reward_clipping

    # Add some seeding to the created Env.
    if seed is not None:
        self.seed(seed)
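# --- Usage sketch (illustration only, not part of the original source) ---
# Shows how a concrete Environment subclass might forward already constructed Spaces to the
# base constructor above. Assumptions: the `rlgraph.environments`/`rlgraph.spaces` import
# paths, the FloatBox/IntBox classes, and `MyToyEnv` itself are illustrative; only the
# (state_space, action_space, seed) signature and the step()/terminate() usage come from
# the surrounding code.
from rlgraph.environments import Environment
from rlgraph.spaces import FloatBox, IntBox


class MyToyEnv(Environment):
    def __init__(self, seed=None):
        # The base c'tor converts the Spaces via `Space.from_spec` and calls `self.seed(seed)`.
        super(MyToyEnv, self).__init__(
            state_space=FloatBox(shape=(4,)), action_space=IntBox(2), seed=seed
        )

    def seed(self, seed=None):
        self.random_seed = seed  # hypothetical seeding logic
        return seed

    def reset(self):
        return self.state_space.sample()

    def step(self, actions):
        # Returns (next_state, reward, terminal, info), matching the `dummy_env.step()`
        # unpacking used by EnvironmentStepper further below.
        return self.state_space.sample(), 0.0, False, None

    def terminate(self):
        pass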
def _create_action_adapters_and_distributions(self, action_space, action_adapter_spec):
    if action_space is None:
        adapter = ActionAdapter.from_spec(action_adapter_spec)
        self.action_space = adapter.action_space
        # Assert single component action space.
        assert len(self.action_space.flatten()) == 1, \
            "ERROR: Action space must not be ContainerSpace if no `action_space` is given in Policy constructor!"
    else:
        self.action_space = Space.from_spec(action_space)

    # Figure out our Distributions.
    for i, (flat_key, action_component) in enumerate(self.action_space.flatten().items()):
        # Spec dict.
        if isinstance(action_adapter_spec, dict):
            aa_spec = flat_key_lookup(action_adapter_spec, flat_key, action_adapter_spec)
            aa_spec["action_space"] = action_component
        # Simple type spec.
        elif not isinstance(action_adapter_spec, ActionAdapter):
            aa_spec = dict(action_space=action_component)
        # Direct object.
        else:
            aa_spec = action_adapter_spec

        if isinstance(aa_spec, dict) and "type" not in aa_spec:
            dist_spec = get_default_distribution_from_space(
                action_component, self.bounded_distribution_type, self.discrete_distribution_type,
                self.gumbel_softmax_temperature
            )
            self.distributions[flat_key] = Distribution.from_spec(
                dist_spec, scope="{}-{}".format(dist_spec["type"], i)
            )
            if self.distributions[flat_key] is None:
                raise RLGraphError(
                    "ERROR: `action_component` is of type {} and not allowed in {} Component!".format(
                        type(action_component).__name__, self.name
                    )
                )
            aa_spec["type"] = get_action_adapter_type_from_distribution_type(
                type(self.distributions[flat_key]).__name__
            )
            self.action_adapters[flat_key] = ActionAdapter.from_spec(aa_spec, scope="action-adapter-{}".format(i))
        else:
            self.action_adapters[flat_key] = ActionAdapter.from_spec(aa_spec, scope="action-adapter-{}".format(i))
            dist_spec = get_distribution_spec_from_action_adapter(self.action_adapters[flat_key])
            self.distributions[flat_key] = Distribution.from_spec(
                dist_spec, scope="{}-{}".format(dist_spec["type"], i)
            )
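# --- Illustration only (not part of the original source) ---
# What the per-component loop above iterates over for a container action space: `flatten()`
# yields one (flat_key, primitive Space) pair per component, and the method creates one
# Distribution plus one matching ActionAdapter per pair (scoped by the index `i`).
# Assumptions: the `rlgraph.spaces` import path, the Dict/FloatBox/IntBox classes and the
# exact flat-key strings are illustrative.
from rlgraph.spaces import Dict, FloatBox, IntBox

container_action_space = Dict({"discrete": IntBox(3), "continuous": FloatBox(shape=(2,))})
for i, (flat_key, component) in enumerate(container_action_space.flatten().items()):
    # e.g. ("/continuous", FloatBox(shape=(2,))) -> a continuous default distribution,
    #      ("/discrete", IntBox(3))              -> a categorical-style default distribution,
    # with the actual choice made by `get_default_distribution_from_space` (or derived from
    # an already specified ActionAdapter via `get_distribution_spec_from_action_adapter`).
    print(i, flat_key, component)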
def __init__(self, environment_spec, actor_component_spec, num_steps=20, state_space=None, reward_space=None,
             internal_states_space=None, add_action_probs=False, action_probs_space=None, add_action=False,
             add_reward=False, add_previous_action_to_state=False, add_previous_reward_to_state=False,
             scope="environment-stepper", **kwargs):
    """
    Args:
        environment_spec (dict): A specification dict for constructing an Environment object that will be run
            inside a SpecifiableServer for in-graph stepping.
        actor_component_spec (Union[ActorComponent,dict]): A specification dict to construct this EnvStepper's
            ActorComponent (to generate actions) or an already constructed ActorComponent object.
        num_steps (int): The number of steps to perform per `step` call.
        state_space (Optional[Space]): The state Space of the Environment. If None, will construct a dummy
            environment to get the state Space from there.
        reward_space (Optional[Space]): The reward Space of the Environment. If None, will construct a dummy
            environment to get the reward Space from there.
        internal_states_space (Optional[Space]): The internal states Space (when using an RNN inside the
            ActorComponent).
        add_action_probs (bool): Whether to add all action probabilities for each step to the ActorComponent's
            outputs at each step. These will be added as an additional tensor to the output of the `step`
            API-method. Default: False.
        action_probs_space (Optional[Space]): If add_action_probs is True, the Space that the action_probs will
            have. This is usually just the flattened (one-hot) action space.
        add_action (bool): Whether to add the action to the output of the `step` API-method. Default: False.
        add_reward (bool): Whether to add the reward to the output of the `step` API-method. Default: False.
        add_previous_action_to_state (bool): Whether to add the previous action as another input channel to the
            ActorComponent's (NN's) input at each step. This is only possible if the state space is already a
            Dict. It will be added under the key "previous_action". Default: False.
        add_previous_reward_to_state (bool): Whether to add the previous reward as another input channel to the
            ActorComponent's (NN's) input at each step. This is only possible if the state space is already a
            Dict. It will be added under the key "previous_reward". Default: False.
    """
    super(EnvironmentStepper, self).__init__(scope=scope, **kwargs)

    # Only to retrieve some information about the particular Env.
    dummy_env = Environment.from_spec(environment_spec)  # type: Environment

    # Create the SpecifiableServer with the given env spec.
    if state_space is None or reward_space is None:
        state_space = dummy_env.state_space
        if reward_space is None:
            _, reward, _, _ = dummy_env.step(dummy_env.action_space.sample())
            # TODO: this may break on non 64-bit machines. tf seems to interpret a python float as tf.float64.
            reward_space = Space.from_spec(
                "float64" if type(reward) == float else float, shape=(1,)
            ).with_batch_rank()
    else:
        reward_space = Space.from_spec(reward_space).with_batch_rank()
    self.reward_space = reward_space
    self.action_space = dummy_env.action_space
    dummy_env.terminate()

    # The state that the environment produces.
    self.state_space_env = state_space
    # The state that must be fed into the actor-component to produce an action.
    # May contain prev_action and prev_reward.
    self.state_space_actor = state_space
    self.add_previous_action_to_state = add_previous_action_to_state
    self.add_previous_reward_to_state = add_previous_reward_to_state

    # Include actions and/or rewards in the `step` API-method's output?
    self.add_action = add_action
    self.add_reward = add_reward

    # The problem with ContainerSpaces here is that py_func (SpecifiableServer) cannot handle container
    # spaces, which is why we need to painfully convert these into flat spaces and tuples here whenever
    # we make a call to the env. So to keep things unified, we treat all container spaces
    # (state space, preprocessed state) from here on as tuples of primitive spaces, sorted by their would-be
    # flat-keys in a flattened dict.
    self.state_space_env_flattened = self.state_space_env.flatten()
    # Need to flatten the state-space in case it's a ContainerSpace for the return dtypes.
    self.state_space_env_list = list(self.state_space_env_flattened.values())

    # TODO: automate this by lookup from the NN Component.
    self.internal_states_space = None
    if internal_states_space is not None:
        self.internal_states_space = internal_states_space.with_batch_rank(add_batch_rank=1)

    # Add the action/reward spaces to the state space (must be Dict).
    if self.add_previous_action_to_state is True:
        assert isinstance(self.state_space_actor, Dict), \
            "ERROR: If `add_previous_action_to_state` is True as input, state_space must be a Dict!"
        self.state_space_actor["previous_action"] = self.action_space
    if self.add_previous_reward_to_state is True:
        assert isinstance(self.state_space_actor, Dict), \
            "ERROR: If `add_previous_reward_to_state` is True as input, state_space must be a Dict!"
        self.state_space_actor["previous_reward"] = self.reward_space
    self.state_space_actor_flattened = self.state_space_actor.flatten()
    self.state_space_actor_list = list(self.state_space_actor_flattened.values())

    self.add_action_probs = add_action_probs
    self.action_probs_space = action_probs_space

    self.environment_spec = environment_spec
    self.environment_server = SpecifiableServer(
        class_=Environment,
        spec=environment_spec,
        output_spaces=dict(
            step_for_env_stepper=self.state_space_env_list + [self.reward_space, bool],
            reset_for_env_stepper=self.state_space_env_list
        ),
        shutdown_method="terminate"
    )

    # Add the sub-components.
    self.actor_component = ActorComponent.from_spec(actor_component_spec)  # type: ActorComponent
    self.preprocessed_state_space = self.actor_component.preprocessor.get_preprocessed_space(
        self.state_space_actor
    )
    self.num_steps = num_steps

    # Variables that hold information of last step through Env.
    self.current_terminal = None
    self.current_state = None
    self.current_action = None  # Only if self.add_action is True.
    self.current_reward = None  # Only if self.add_reward is True.
    self.current_internal_states = None
    self.current_action_probs = None
    self.time_step = 0

    self.has_rnn = self.actor_component.policy.neural_network.has_rnn()

    # Add all sub-components (only ActorComponent).
    self.add_components(self.actor_component)
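# --- Usage sketch (illustration only, not part of the original source) ---
# Constructing an EnvironmentStepper from spec dicts. Assumptions: the "openai" environment
# type string, the gym env name, and the actor-component/policy spec keys are illustrative
# guesses; only the EnvironmentStepper keyword arguments come from the constructor above.
environment_spec = dict(type="openai", gym_env="CartPole-v0")

env_stepper = EnvironmentStepper(
    environment_spec=environment_spec,
    actor_component_spec=dict(
        preprocessor_spec=None,
        policy_spec=dict(network_spec=[dict(type="dense", units=32)]),
        exploration_spec=None
    ),
    num_steps=50,
    add_action=True,   # include the chosen actions in the `step` output
    add_reward=True    # include the rewards in the `step` output
)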
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, exploration_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None,
             update_spec=None, summary_spec=None, saver_spec=None, auto_build=True, name="agent"):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        discount (float): The discount factor (gamma).
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list): Neural network specification for the baseline (value function).
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
    """
    super(Agent, self).__init__()
    self.name = name
    self.auto_build = auto_build
    self.graph_built = False
    self.logger = logging.getLogger(__name__)

    self.state_space = Space.from_spec(state_space).with_batch_rank(False)
    self.flat_state_space = self.state_space.flatten() if isinstance(self.state_space, ContainerSpace) else None
    self.logger.info("Parsed state space definition: {}".format(self.state_space))
    self.action_space = Space.from_spec(action_space).with_batch_rank(False)
    self.flat_action_space = self.action_space.flatten() if isinstance(self.action_space, ContainerSpace) else None
    self.logger.info("Parsed action space definition: {}".format(self.action_space))

    self.discount = discount
    self.build_options = {}

    # The agent's root-Component.
    self.root_component = Component(name=self.name, nesting_level=0)

    # Define the input-Spaces:
    # Tag the input-Space to `self.set_weights` as equal to whatever the variables-Space will be for
    # the Agent's policy Component.
    self.input_spaces = dict(
        states=self.state_space.with_batch_rank(),
    )

    # Construct the Preprocessor.
    self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
    self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
    self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
    if self.preprocessing_required:
        self.logger.info("Preprocessing required.")
        self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
    else:
        self.logger.info("No preprocessing required.")

    # Construct the Policy network.
    policy_spec = policy_spec or dict()
    if "network_spec" not in policy_spec:
        policy_spec["network_spec"] = network_spec
    if "action_space" not in policy_spec:
        policy_spec["action_space"] = self.action_space
    self.policy_spec = policy_spec
    # The behavioral policy of the algorithm. Also the one that gets updated.
    self.policy = Policy.from_spec(self.policy_spec)
    # Done by default.
    self.policy.add_components(Synchronizable(), expose_apis="sync")

    # Create non-shared baseline network.
    self.value_function = None
    if value_function_spec is not None:
        self.value_function = ValueFunction(network_spec=value_function_spec)
        self.value_function.add_components(Synchronizable(), expose_apis="sync")
        self.vars_merger = ContainerMerger("policy", "vf", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", "vf", scope="variable-container-splitter")
    else:
        self.vars_merger = ContainerMerger("policy", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", scope="variable-container-splitter")

    self.internal_states_space = Space.from_spec(internal_states_space)

    # An object implementing the loss function interface is only strictly needed
    # if automatic device strategies like multi-gpu are enabled. This is because
    # the device strategy needs to know the name of the loss function to infer the appropriate
    # operations.
    self.loss_function = None

    self.exploration = Exploration.from_spec(exploration_spec)
    self.execution_spec = parse_execution_spec(execution_spec)

    # Python-side experience buffer for better performance (may be disabled).
    self.default_env = "env_0"

    def factory_(i):
        if i < 2:
            return []
        return tuple([[] for _ in range(i)])

    self.states_buffer = defaultdict(list)  # partial(factory_, len(self.flat_state_space))
    self.actions_buffer = defaultdict(partial(factory_, len(self.flat_action_space or [])))
    self.internals_buffer = defaultdict(list)
    self.rewards_buffer = defaultdict(list)
    self.next_states_buffer = defaultdict(list)  # partial(factory_, len(self.flat_state_space))
    self.terminals_buffer = defaultdict(list)

    self.observe_spec = parse_observe_spec(observe_spec)

    # Global time step counter.
    self.timesteps = 0

    # Create the Agent's optimizer based on optimizer_spec and execution strategy.
    self.optimizer = None
    if optimizer_spec is not None:
        # Save spec in case agent needs to create more optimizers e.g. for baseline.
        self.optimizer_spec = optimizer_spec
        self.optimizer = Optimizer.from_spec(optimizer_spec)

    self.value_function_optimizer = None
    if self.value_function is not None:
        if value_function_optimizer_spec is None:
            vf_optimizer_spec = self.optimizer_spec
        else:
            vf_optimizer_spec = value_function_optimizer_spec
        vf_optimizer_spec["scope"] = "value-function-optimizer"
        self.value_function_optimizer = Optimizer.from_spec(vf_optimizer_spec)

    # Update-spec dict tells the Agent how to update (e.g. memory batch size).
    self.update_spec = parse_update_spec(update_spec)

    # Create our GraphBuilder and -Executor.
    self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
    self.graph_executor = GraphExecutor.from_spec(
        get_backend(),
        graph_builder=self.graph_builder,
        execution_spec=self.execution_spec,
        saver_spec=saver_spec
    )  # type: GraphExecutor
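# --- Usage sketch (illustration only, not part of the original source) ---
# What a call into this base constructor typically looks like from a concrete Agent
# subclass. Assumptions: `MyAgent`, the import paths, and the concrete spec dicts are
# illustrative; only the keyword argument names come from the signature above.
from rlgraph.agents import Agent
from rlgraph.spaces import FloatBox, IntBox


class MyAgent(Agent):
    pass  # hypothetical minimal subclass, for illustration only


agent = MyAgent(
    state_space=FloatBox(shape=(4,)),
    action_space=IntBox(2),
    network_spec=[dict(type="dense", units=64)],     # becomes policy_spec["network_spec"]
    optimizer_spec=dict(type="adam", learning_rate=0.001),
    update_spec=dict(batch_size=32),                 # parsed by parse_update_spec()
    auto_build=False                                 # defer graph building to an explicit agent.build()
)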