def __init__(self, clip_ratio, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the PPO algorithm.
    """
    super(PPOAgent, self).__init__(name=kwargs.pop("name", "ppo-agent"), **kwargs)
    self.train_time_steps = 0

    # PPO uses a ring buffer.
    self.memory = Memory.from_spec(memory_spec)
    self.record_space = Dict(states=self.state_space, actions=self.action_space, rewards=float,
                             terminals=BoolBox(), add_batch_rank=False)
    self.policy = Policy(network_spec=self.neural_network, action_adapter_spec=None)

    self.merger = DictMerger(output_space=self.record_space)
    splitter_input_space = copy.deepcopy(self.record_space)
    self.splitter = ContainerSplitter(input_space=splitter_input_space)
    self.loss_function = PPOLossFunction(clip_ratio=clip_ratio, discount=self.discount)

    self.define_graph_api()
    if self.auto_build:
        self._build_graph()
        self.graph_built = True
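# For reference, `clip_ratio` parameterizes PPO's clipped surrogate objective. The function below is a
# minimal NumPy sketch of that objective (illustrative only, not the graph code inside PPOLossFunction);
# the array names are assumptions for the example.
import numpy as np

def ppo_clip_objective(log_probs, old_log_probs, advantages, clip_ratio=0.2):
    """Minimal sketch of the PPO clipped surrogate objective (to be maximized)."""
    # Likelihood ratio between the current policy and the rollout (old) policy.
    ratio = np.exp(log_probs - old_log_probs)
    # Clip the ratio into [1 - clip_ratio, 1 + clip_ratio].
    clipped_ratio = np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    # Element-wise minimum makes the update pessimistic w.r.t. the clipping.
    return np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))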
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, execution_spec=None,
             optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None, update_spec=None,
             summary_spec=None, saver_spec=None, auto_build=True, name="actor-critic-agent", gae_lambda=1.0,
             clip_rewards=0.0, sample_episodes=False, weight_entropy=None, memory_spec=None):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
            of ValueFunction.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clip value. If not 0, rewards will be clipped into this range.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes
            to fetch from the memory. If False, batch_size refers to the number of time-steps. This is
            especially relevant for environments where episode lengths may vastly differ throughout training.
            For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    # Set policy to stochastic.
    if policy_spec is not None:
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(ActorCriticAgent, self).__init__(
        state_space=state_space,
        action_space=action_space,
        discount=discount,
        preprocessing_spec=preprocessing_spec,
        network_spec=network_spec,
        internal_states_space=internal_states_space,
        policy_spec=policy_spec,
        value_function_spec=value_function_spec,
        execution_spec=execution_spec,
        optimizer_spec=optimizer_spec,
        value_function_optimizer_spec=value_function_optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        summary_spec=summary_spec,
        saver_spec=saver_spec,
        name=name,
        auto_build=auto_build
    )
    self.sample_episodes = sample_episodes

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True)
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = ActorCriticLossFunction(weight_entropy=weight_entropy)

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.gae_function
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"], build_options=self.build_options
        )
        self.graph_built = True
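# The GeneralizedAdvantageEstimation component above is parameterized by `gae_lambda`, `discount` and
# `clip_rewards`. As a reference for what those parameters mean, here is a minimal NumPy sketch of GAE over
# a single rollout (illustrative only, not the component's graph implementation).
import numpy as np

def gae_advantages(rewards, values, terminals, discount=0.98, gae_lambda=1.0, clip_rewards=0.0):
    """Sketch of generalized advantage estimation for one rollout.

    `values` must contain one extra entry: the bootstrap value of the state following the last step.
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    if clip_rewards != 0.0:
        rewards = np.clip(rewards, -clip_rewards, clip_rewards)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - float(terminals[t])
        # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t).
        delta = rewards[t] + discount * values[t + 1] * non_terminal - values[t]
        gae = delta + discount * gae_lambda * non_terminal * gae
        advantages[t] = gae
    return advantages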
def __init__(self, double_q=True, dueling_q=True, huber_loss=False, n_step=1, memory_spec=None,
             store_last_memory_batch=False, store_last_q_table=False, **kwargs):
    """
    Args:
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
            `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
            (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix action-adapter before passing it to the super constructor.
    action_adapter_spec = kwargs.pop("action_adapter_spec", dict())
    # Use a DuelingActionAdapter (instead of a basic ActionAdapter) if option is set.
    if dueling_q is True:
        action_adapter_spec["type"] = "dueling-action-adapter"
        assert "units_state_value_stream" in action_adapter_spec
        assert "units_advantage_stream" in action_adapter_spec

    super(DQNAgent, self).__init__(
        action_adapter_spec=action_adapter_spec,
        name=kwargs.pop("name", "dqn-agent"),
        **kwargs
    )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        weights="variables:policy",
        time_step=int,
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space,
        # TODO: This is currently necessary for multi-GPU handling (as the update_from_external_batch
        # TODO: gets overridden by a generic function with args=*inputs).
        #inputs=[preprocessed_state_space, self.action_space.with_batch_rank(), reward_space, terminal_space,
        #        preprocessed_state_space, weight_space]
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    self.target_policy.add_components(Synchronizable(), expose_apis="sync")
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        importance_weights=use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api("policy", "preprocessor-stack", self.optimizer.scope, *sub_components)

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
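# To make the `double_q` flag concrete: with double Q-learning, the online network selects the argmax
# action in the next state while the target network evaluates it. Below is a minimal NumPy sketch of the
# resulting 1-step targets (illustrative only; the actual computation lives in DQNLossFunction).
import numpy as np

def dqn_targets(rewards, terminals, q_next_online, q_next_target, discount=0.98, double_q=True):
    """Sketch of (double) DQN 1-step targets for a batch of transitions."""
    rewards = np.asarray(rewards, dtype=np.float32)
    not_done = 1.0 - np.asarray(terminals, dtype=np.float32)
    if double_q:
        # Online net picks the action, target net evaluates it.
        best_actions = np.argmax(q_next_online, axis=1)
        next_values = q_next_target[np.arange(len(rewards)), best_actions]
    else:
        # Vanilla DQN: the target net both picks and evaluates.
        next_values = np.max(q_next_target, axis=1)
    return rewards + discount * not_done * next_values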
def __init__(self, gae_lambda=1.0, sample_episodes=False, weight_entropy=None, memory_spec=None, **kwargs):
    """
    Args:
        gae_lambda (float): Lambda for generalized advantage estimation.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes
            to fetch from the memory. If False, batch_size refers to the number of time-steps. This is
            especially relevant for environments where episode lengths may vastly differ throughout training.
            For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    super(ActorCriticAgent, self).__init__(
        policy_spec=dict(deterministic=False),  # Set policy to stochastic.
        name=kwargs.pop("name", "actor-critic-agent"),
        **kwargs
    )
    self.sample_episodes = sample_episodes

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True)
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.loss_function = ActorCriticLossFunction(
        discount=self.discount, gae_lambda=gae_lambda, weight_entropy=weight_entropy
    )

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"],
            build_options=dict(vf_optimizer=self.value_function_optimizer)
        )
        self.graph_built = True
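# The `weight_entropy` coefficient scales an entropy bonus that discourages premature collapse to a
# deterministic policy. Below is a minimal sketch of an entropy-regularized policy-gradient loss for a
# discrete action distribution (illustrative only, not ActorCriticLossFunction's graph code; the default
# coefficient value is an assumption).
import numpy as np

def actor_loss_with_entropy(log_probs_taken, advantages, action_probs, weight_entropy=0.01):
    """Sketch: policy-gradient loss minus a weighted entropy bonus (to be minimized)."""
    pg_loss = -np.mean(log_probs_taken * advantages)
    # Entropy of the categorical action distribution, averaged over the batch.
    entropy = -np.mean(np.sum(action_probs * np.log(action_probs + 1e-10), axis=1))
    return pg_loss - weight_entropy * entropy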
def __init__(self, expert_margin=0.5, supervised_weight=1.0, double_q=True, dueling_q=True, huber_loss=False,
             n_step=1, shared_container_action_target=True, memory_spec=None, demo_memory_spec=None,
             demo_sample_ratio=0.2, store_last_memory_batch=False, store_last_q_table=False, **kwargs):
    # TODO: Most of this is a DQN duplicate, but the way the loss function is instantiated, inheriting
    # from DQN does not work well.
    """
    Args:
        expert_margin (float): The expert margin enforces a distance in Q-values between the expert action and
            all other actions.
        supervised_weight (float): Indicates the weight of the expert loss.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
        demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
            `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
            (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix action-adapter before passing it to the super constructor.
    policy_spec = kwargs.pop("policy_spec", dict())
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQFDAgent, self).__init__(
        policy_spec=policy_spec,
        name=kwargs.pop("name", "dqfd-agent"),
        **kwargs
    )

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.demo_batch_size = int(demo_sample_ratio * self.update_spec["batch_size"] / (1.0 - demo_sample_ratio))
    self.shared_container_action_target = shared_container_action_target

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        demo_batch_size=int,
        apply_demo_loss=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # Cannot have the same default name.
    demo_memory_spec["scope"] = "demo-memory"
    self.demo_memory = Memory.from_spec(demo_memory_spec)
    # The splitter for splitting up the records from the memories.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQFDLossFunction(
        expert_margin=expert_margin, supervised_weight=supervised_weight,
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
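# The `expert_margin` and `supervised_weight` arguments feed DQfD's large-margin supervised term,
# J_E(Q) = max_a[Q(s,a) + l(a_E,a)] - Q(s,a_E), where l(a_E,a) equals `expert_margin` for a != a_E and 0
# otherwise. A minimal NumPy sketch for a batch of demo transitions (illustrative only; the real term is
# built inside DQFDLossFunction).
import numpy as np

def dqfd_supervised_loss(q_values, expert_actions, expert_margin=0.5, supervised_weight=1.0):
    """Sketch of the DQfD large-margin classification loss."""
    batch_indices = np.arange(len(expert_actions))
    # Margin l(a_E, a): expert_margin everywhere except at the expert action.
    margins = np.full(q_values.shape, expert_margin)
    margins[batch_indices, expert_actions] = 0.0
    q_expert = q_values[batch_indices, expert_actions]
    # max_a [Q(s,a) + l(a_E,a)] - Q(s,a_E); zero once the expert action dominates by the margin.
    loss = np.max(q_values + margins, axis=1) - q_expert
    return supervised_weight * np.mean(loss)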
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, exploration_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None,
             update_spec=None, summary_spec=None, saver_spec=None, auto_build=True, name="agent"):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list): Neural network specification for baseline.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
    """
    super(Agent, self).__init__()
    self.name = name
    self.auto_build = auto_build
    self.graph_built = False
    self.logger = logging.getLogger(__name__)

    self.state_space = Space.from_spec(state_space).with_batch_rank(False)
    self.flat_state_space = self.state_space.flatten() if isinstance(self.state_space, ContainerSpace) else None
    self.logger.info("Parsed state space definition: {}".format(self.state_space))
    self.action_space = Space.from_spec(action_space).with_batch_rank(False)
    self.flat_action_space = self.action_space.flatten() if isinstance(self.action_space, ContainerSpace) else None
    self.logger.info("Parsed action space definition: {}".format(self.action_space))

    self.discount = discount
    self.build_options = {}

    # The agent's root-Component.
    self.root_component = Component(name=self.name, nesting_level=0)

    # Define the input-Spaces:
    # Tag the input-Space to `self.set_weights` as equal to whatever the variables-Space will be for
    # the Agent's policy Component.
    self.input_spaces = dict(
        states=self.state_space.with_batch_rank(),
    )

    # Construct the Preprocessor.
    self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
    self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
    self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
    if self.preprocessing_required:
        self.logger.info("Preprocessing required.")
        self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
    else:
        self.logger.info("No preprocessing required.")

    # Construct the Policy network.
    policy_spec = policy_spec or dict()
    if "network_spec" not in policy_spec:
        policy_spec["network_spec"] = network_spec
    if "action_space" not in policy_spec:
        policy_spec["action_space"] = self.action_space
    self.policy_spec = policy_spec
    # The behavioral policy of the algorithm. Also the one that gets updated.
    self.policy = Policy.from_spec(self.policy_spec)
    # Done by default.
    self.policy.add_components(Synchronizable(), expose_apis="sync")

    # Create non-shared baseline network.
    self.value_function = None
    if value_function_spec is not None:
        self.value_function = ValueFunction(network_spec=value_function_spec)
        self.value_function.add_components(Synchronizable(), expose_apis="sync")
        self.vars_merger = ContainerMerger("policy", "vf", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", "vf", scope="variable-container-splitter")
    else:
        self.vars_merger = ContainerMerger("policy", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", scope="variable-container-splitter")

    self.internal_states_space = Space.from_spec(internal_states_space)

    # An object implementing the loss function interface is only strictly needed
    # if automatic device strategies like multi-gpu are enabled. This is because
    # the device strategy needs to know the name of the loss function to infer the appropriate
    # operations.
    self.loss_function = None

    self.exploration = Exploration.from_spec(exploration_spec)
    self.execution_spec = parse_execution_spec(execution_spec)

    # Python-side experience buffer for better performance (may be disabled).
    self.default_env = "env_0"

    def factory_(i):
        if i < 2:
            return []
        return tuple([[] for _ in range(i)])

    self.states_buffer = defaultdict(list)  # partial(fact_, len(self.flat_state_space)))
    self.actions_buffer = defaultdict(partial(factory_, len(self.flat_action_space or [])))
    self.internals_buffer = defaultdict(list)
    self.rewards_buffer = defaultdict(list)
    self.next_states_buffer = defaultdict(list)  # partial(fact_, len(self.flat_state_space)))
    self.terminals_buffer = defaultdict(list)

    self.observe_spec = parse_observe_spec(observe_spec)

    # Global time step counter.
    self.timesteps = 0

    # Create the Agent's optimizer based on optimizer_spec and execution strategy.
    self.optimizer = None
    if optimizer_spec is not None:
        # Save the spec in case the agent needs to create more optimizers, e.g. for the baseline.
        self.optimizer_spec = optimizer_spec
        self.optimizer = Optimizer.from_spec(optimizer_spec)

    self.value_function_optimizer = None
    if self.value_function is not None:
        if value_function_optimizer_spec is None:
            vf_optimizer_spec = self.optimizer_spec
        else:
            vf_optimizer_spec = value_function_optimizer_spec
        vf_optimizer_spec["scope"] = "value-function-optimizer"
        self.value_function_optimizer = Optimizer.from_spec(vf_optimizer_spec)

    # Update-spec dict tells the Agent how to update (e.g. memory batch size).
    self.update_spec = parse_update_spec(update_spec)

    # Create our GraphBuilder and -Executor.
    self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
    self.graph_executor = GraphExecutor.from_spec(
        get_backend(),
        graph_builder=self.graph_builder,
        execution_spec=self.execution_spec,
        saver_spec=saver_spec
    )  # type: GraphExecutor
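# Everything in this constructor is driven by spec dicts resolved through the various `from_spec` calls
# (Space, PreprocessorStack, Policy, Exploration, Optimizer, GraphExecutor). The snippet below is a hedged
# sketch of what such specs might look like when configuring a concrete agent; the registered type strings
# and values are assumptions and may differ between RLGraph versions.
state_space_spec = dict(type="float-box", shape=(4,))      # assumed space type string
action_space_spec = dict(type="int-box", num_categories=2)  # assumed space type string
network_spec = [
    dict(type="dense", units=64, activation="relu"),
    dict(type="dense", units=64, activation="relu"),
]
optimizer_spec = dict(type="adam", learning_rate=0.001)
update_spec = dict(update_interval=4, batch_size=64, sync_interval=32)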
def __init__(self, double_q=True, dueling_q=True, huber_loss=False, n_step=1,
             shared_container_action_target=True, memory_spec=None, store_last_memory_batch=False,
             store_last_q_table=False, **kwargs):
    """
    Args:
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
            `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
            (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix action-adapter before passing it to the super constructor.
    policy_spec = kwargs.pop("policy_spec", dict())
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQNAgent, self).__init__(
        policy_spec=policy_spec,
        name=kwargs.pop("name", "dqn-agent"),
        **kwargs
    )

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    #self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.shared_container_action_target = shared_container_action_target

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        # Weights will have a Space derived from the vars of policy.
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space
    ))
    if self.value_function is not None:
        self.input_spaces["value_function_weights"] = "variables:{}".format(self.value_function.scope)

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy, self.target_policy,
        self.value_function, self.value_function_optimizer,  # <- should both be None for DQN
        self.exploration, self.loss_function, self.optimizer, self.vars_merger, self.vars_splitter
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
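# `importance_weights` only becomes an active loss input when the memory is a PrioritizedReplay. For
# reference, the usual prioritized-replay importance-sampling correction looks like the sketch below
# (illustrative only; the weights are produced by the memory component, not by this agent, and the
# beta/epsilon values are assumptions). Priorities are assumed to be already exponentiated by alpha.
import numpy as np

def importance_sampling_weights(priorities, sampled_indices, beta=0.4, epsilon=1e-6):
    """Sketch: IS-weights w_i = (N * P(i))^-beta, normalized by the maximum weight."""
    priorities = np.asarray(priorities, dtype=np.float64) + epsilon
    probs = priorities / np.sum(priorities)
    n = len(priorities)
    weights = (n * probs[sampled_indices]) ** (-beta)
    return weights / np.max(weights)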
def __init__(
    self,
    state_space,
    action_space,
    discount=0.98,
    preprocessing_spec=None,
    network_spec=None,
    internal_states_space=None,
    policy_spec=None,
    exploration_spec=None,
    execution_spec=None,
    optimizer_spec=None,
    observe_spec=None,
    update_spec=None,
    summary_spec=None,
    saver_spec=None,
    auto_build=True,
    name="dqfd-agent",
    expert_margin=0.5,
    supervised_weight=1.0,
    double_q=True,
    dueling_q=True,
    huber_loss=False,
    n_step=1,
    shared_container_action_target=False,
    memory_spec=None,
    demo_memory_spec=None,
    demo_sample_ratio=0.2
):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        expert_margin (float): The expert margin enforces a distance in Q-values between the expert action and
            all other actions.
        supervised_weight (float): Indicates the weight of the expert loss.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
        demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
    """
    # Fix action-adapter before passing it to the super constructor.
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        if policy_spec is None:
            policy_spec = {}
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQFDAgent, self).__init__(
        state_space=state_space,
        action_space=action_space,
        discount=discount,
        preprocessing_spec=preprocessing_spec,
        network_spec=network_spec,
        internal_states_space=internal_states_space,
        policy_spec=policy_spec,
        exploration_spec=exploration_spec,
        execution_spec=execution_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        summary_spec=summary_spec,
        saver_spec=saver_spec,
        auto_build=auto_build,
        name=name
    )

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.expert_margin = expert_margin

    self.batch_size = self.update_spec["batch_size"]
    self.default_margins = np.asarray([self.expert_margin] * self.batch_size)

    self.demo_batch_size = int(demo_sample_ratio * self.update_spec["batch_size"] / (1.0 - demo_sample_ratio))
    self.demo_margins = np.asarray([self.expert_margin] * self.demo_batch_size)
    self.shared_container_action_target = shared_container_action_target

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        demo_batch_size=int,
        apply_demo_loss=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        expert_margins=FloatBox(add_batch_rank=True),
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # Cannot have the same default name.
    demo_memory_spec["scope"] = "demo-memory"
    self.demo_memory = Memory.from_spec(demo_memory_spec)
    # The splitter for splitting up the records from the memories.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    self.use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQFDLossFunction(
        supervised_weight=supervised_weight,
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=self.use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
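# The formula `int(demo_sample_ratio * batch_size / (1 - demo_sample_ratio))` sizes the demo sample so that
# demonstrations make up roughly `demo_sample_ratio` of the combined update batch. A quick worked example
# (the concrete numbers are illustrative):
batch_size_example = 64
demo_sample_ratio_example = 0.2
demo_batch_size_example = int(demo_sample_ratio_example * batch_size_example / (1.0 - demo_sample_ratio_example))
# demo_batch_size_example == 16, combined batch == 64 + 16 == 80, and 16 / 80 == 0.2.
assert demo_batch_size_example == 16
assert abs(demo_batch_size_example / (batch_size_example + demo_batch_size_example) - demo_sample_ratio_example) < 1e-9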
def __init__(self, clip_ratio=0.2, gae_lambda=1.0, clip_rewards=0.0, standardize_advantages=False,
             sample_episodes=True, weight_entropy=None, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clip value. If not 0, rewards will be clipped into this range.
        standardize_advantages (bool): If True, standardize advantage values in the update.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes
            to fetch from the memory. If False, batch_size refers to the number of time-steps. This is
            especially relevant for environments where episode lengths may vastly differ throughout training.
            For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    if "policy_spec" in kwargs:
        policy_spec = kwargs.pop("policy_spec")
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(PPOAgent, self).__init__(
        policy_spec=policy_spec,  # Set policy to stochastic.
        name=kwargs.pop("name", "ppo-agent"),
        **kwargs
    )
    self.sample_episodes = sample_episodes

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:policy",
        value_function_weights="variables:value-function",
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True),
        apply_postprocessing=bool
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: PPO memory must be ring-buffer for episode-handling!"

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = PPOLossFunction(
        clip_ratio=clip_ratio, standardize_advantages=standardize_advantages, weight_entropy=weight_entropy
    )

    self.iterations = self.update_spec["num_iterations"]
    self.sample_size = self.update_spec["sample_size"]
    self.batch_size = self.update_spec["batch_size"]

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy, self.exploration,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.vars_merger, self.vars_splitter, self.gae_function
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            # Important: Use sample-size, not batch-size, as the sub-samples (from a batch) are the ones that
            # get multi-gpu-split.
            batch_size=self.update_spec["sample_size"],
            build_options=self.build_options
        )
        self.graph_built = True
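# `num_iterations`, `sample_size` and `batch_size` from `update_spec` describe PPO's update pattern: pull
# one batch from the ring buffer, then run several optimization iterations on random sub-samples of size
# `sample_size`. The sketch below shows that loop schematically (an assumption about the pattern, not the
# agent's actual `update()` implementation); `update_fn` stands in for one gradient step.
import numpy as np

def ppo_update_loop(batch, num_iterations, sample_size, update_fn, rng=np.random):
    """Sketch: repeatedly update on random sub-samples of one sampled batch of NumPy arrays."""
    batch_size = len(batch["rewards"])
    for _ in range(num_iterations):
        idx = rng.choice(batch_size, size=min(sample_size, batch_size), replace=False)
        sub_batch = {key: values[idx] for key, values in batch.items()}
        update_fn(sub_batch)  # e.g. one gradient step on policy and value function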
def __init__(
    self,
    state_space,
    action_space,
    discount=0.98,
    preprocessing_spec=None,
    network_spec=None,
    internal_states_space=None,
    policy_spec=None,
    exploration_spec=None,
    execution_spec=None,
    optimizer_spec=None,
    observe_spec=None,
    update_spec=None,
    summary_spec=None,
    saver_spec=None,
    auto_build=True,
    name="dqn-agent",
    double_q=True,
    dueling_q=True,
    huber_loss=False,
    n_step=1,
    shared_container_action_target=True,
    memory_spec=None
):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
            NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
    """
    # Fix action-adapter before passing it to the super constructor.
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        if policy_spec is None:
            policy_spec = {}
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQNAgent, self).__init__(
        state_space=state_space,
        action_space=action_space,
        discount=discount,
        preprocessing_spec=preprocessing_spec,
        network_spec=network_spec,
        internal_states_space=internal_states_space,
        policy_spec=policy_spec,
        exploration_spec=exploration_spec,
        execution_spec=execution_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        summary_spec=summary_spec,
        saver_spec=saver_spec,
        auto_build=auto_build,
        name=name
    )

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.shared_container_action_target = shared_container_action_target

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        # Weights will have a Space derived from the vars of policy.
        policy_weights="variables:{}".format(self.policy.scope),
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space,
        apply_postprocessing=bool
    ))
    if self.value_function is not None:
        self.input_spaces["value_function_weights"] = "variables:{}".format(self.value_function.scope)

    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    self.root_component.add_components(
        self.preprocessor, self.memory, self.splitter, self.policy, self.target_policy,
        self.value_function, self.value_function_optimizer,  # <- should both be None for DQN
        self.exploration, self.loss_function, self.optimizer, self.vars_merger, self.vars_splitter
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
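# A hedged end-to-end usage sketch for this constructor. The import paths follow RLGraph's package layout,
# but the memory type string and the exact spec keys are assumptions that may differ between versions, so
# treat this as an illustration rather than a verified example.
from rlgraph.agents import DQNAgent
from rlgraph.spaces import FloatBox, IntBox

agent = DQNAgent(
    state_space=FloatBox(shape=(4,)),
    action_space=IntBox(2),
    network_spec=[dict(type="dense", units=64)],
    memory_spec=dict(type="prioritized_replay", capacity=10000),  # assumed memory type string
    optimizer_spec=dict(type="adam", learning_rate=0.001),
    update_spec=dict(update_interval=4, sync_interval=32, batch_size=64),
    double_q=True,
    dueling_q=False  # keep the basic Policy so no dueling stream sizes are required
)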