def setup_preprocessor(preprocessing_spec, in_space):
    if preprocessing_spec is not None:
        # TODO move ingraph for python component assembly.
        preprocessing_spec = deepcopy(preprocessing_spec)
        in_space = deepcopy(in_space)
        # Store scopes (set if not given).
        scopes = []
        for i, preprocessor in enumerate(preprocessing_spec):
            if "scope" not in preprocessor:
                preprocessor["scope"] = "preprocessor-{}".format(i)
            scopes.append(preprocessor["scope"])
            # Set backend to python.
            preprocessor["backend"] = "python"

        processor_stack = PreprocessorStack(*preprocessing_spec, backend="python")
        build_space = in_space
        for sub_comp_scope in scopes:
            processor_stack.sub_components[sub_comp_scope].create_variables(
                input_spaces=dict(preprocessing_inputs=build_space),
                action_space=None
            )
            build_space = processor_stack.sub_components[sub_comp_scope].get_preprocessed_space(build_space)
        processor_stack.reset()
        return processor_stack
    else:
        return None
def setup_preprocessor(self, preprocessing_spec, in_space):
    if preprocessing_spec is not None:
        preprocessing_spec = deepcopy(preprocessing_spec)
        in_space = deepcopy(in_space)
        # Set scopes.
        scopes = [preprocessor["scope"] for preprocessor in preprocessing_spec]
        # Set backend to python.
        for spec in preprocessing_spec:
            spec["backend"] = "python"

        processor_stack = PreprocessorStack(*preprocessing_spec, backend="python")
        build_space = in_space
        for sub_comp_scope in scopes:
            processor_stack.sub_components[sub_comp_scope].create_variables(
                input_spaces=dict(preprocessing_inputs=build_space),
                action_space=None
            )
            build_space = processor_stack.sub_components[sub_comp_scope].get_preprocessed_space(build_space)
        processor_stack.reset()
        return processor_stack
    else:
        return None
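# Illustrative sketch (hypothetical stand-in classes, NOT the RLgraph API) of the
# space-threading idea in the two helpers above: each preprocessor is built against
# the output space of its predecessor, so shape changes propagate through the stack
# exactly like the loops over `scopes` do with `get_preprocessed_space`.
class _GrayscaleSketch(object):
    """Drops the channel rank, e.g. (H, W, 3) -> (H, W)."""
    def get_preprocessed_space(self, shape):
        return shape[:-1]

class _RescaleSketch(object):
    """Replaces the spatial dims, e.g. (210, 160, 3) -> (84, 84, 3)."""
    def __init__(self, new_hw):
        self.new_hw = new_hw

    def get_preprocessed_space(self, shape):
        return self.new_hw + shape[2:]

def _thread_build_space_sketch(preprocessors, in_shape):
    build_shape = in_shape
    for p in preprocessors:  # same pattern as the loops over `scopes` above
        build_shape = p.get_preprocessed_space(build_shape)
    return build_shape

assert _thread_build_space_sketch(
    [_RescaleSketch((84, 84)), _GrayscaleSketch()], (210, 160, 3)
) == (84, 84)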
def test_sac_agent_component_on_fake_env(self):
    config = config_from_path("configs/sac_component_for_fake_env_test.json")

    # Arbitrary state space, state should not be used in this example.
    state_space = FloatBox(shape=(2,))
    continuous_action_space = FloatBox(low=-1.0, high=1.0)
    terminal_space = BoolBox(add_batch_rank=True)

    policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
    policy.add_components(Synchronizable(), expose_apis="sync")
    q_function = ValueFunction.from_spec(config["value_function"])

    agent_component = SACAgentComponent(
        agent=None,
        policy=policy,
        q_function=q_function,
        preprocessor=PreprocessorStack.from_spec([]),
        memory=ReplayMemory.from_spec(config["memory"]),
        discount=config["discount"],
        initial_alpha=config["initial_alpha"],
        target_entropy=None,
        optimizer=AdamOptimizer.from_spec(config["optimizer"]),
        vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
        alpha_optimizer=None,
        q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
        num_q_functions=2
    )

    test = ComponentTest(
        component=agent_component,
        input_spaces=dict(
            states=state_space.with_batch_rank(),
            preprocessed_states=state_space.with_batch_rank(),
            actions=continuous_action_space.with_batch_rank(),
            rewards=FloatBox(add_batch_rank=True),
            next_states=state_space.with_batch_rank(),
            terminals=terminal_space,
            batch_size=int,
            preprocessed_s_prime=state_space.with_batch_rank(),
            importance_weights=FloatBox(add_batch_rank=True),
            preprocessed_next_states=state_space.with_batch_rank(),
            deterministic=bool,
            weights="variables:{}".format(policy.scope),
            # TODO: how to provide the space for multiple component variables?
            # q_weights=Dict(
            #     q_0="variables:{}".format(q_function.scope),
            #     q_1="variables:{}".format(agent_component._q_functions[1].scope),
            # )
        ),
        action_space=continuous_action_space,
        build_kwargs=dict(
            optimizer=agent_component._optimizer,
            build_options=dict(
                vf_optimizer=agent_component.vf_optimizer,
            ),
        )
    )

    policy_loss = []
    vf_loss = []
    # This test simulates an env that always requires actions to be close to the max-pdf
    # value of a loc=0.5, scale=0.2 normal, regardless of any state inputs.
    # The component should learn to produce actions like that (close to 0.5).
    true_mean = 0.5
    target_dist = stats.norm(loc=true_mean, scale=0.2)
    batch_size = 100
    for _ in range(5000):
        action_sample = continuous_action_space.sample(batch_size)
        rewards = target_dist.pdf(action_sample)
        result = test.test(("update_from_external_batch", [
            state_space.sample(batch_size),
            action_sample,
            rewards,
            [True] * batch_size,
            state_space.sample(batch_size),
            [1.0] * batch_size  # importance
        ]))
        policy_loss.append(result["actor_loss"])
        vf_loss.append(result["critic_loss"])

    # Losses over the first 100 updates should exceed those over the last 100.
    self.assertTrue(np.mean(policy_loss[:100]) > np.mean(policy_loss[-100:]))
    self.assertTrue(np.mean(vf_loss[:100]) > np.mean(vf_loss[-100:]))

    action_sample = np.linspace(-1, 1, batch_size)
    q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
    for q_val in q_values:
        q_val = q_val.flatten()
        np.testing.assert_allclose(q_val, target_dist.pdf(action_sample), atol=0.2)

    action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
    action_sample = action_sample.flatten()
    np.testing.assert_allclose(np.mean(action_sample), true_mean, atol=0.1)
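# Hedged side note on SyncSpecification(sync_interval=10, sync_tau=1.0) used above:
# it describes when and how target Q-networks track the online ones. Below is a
# minimal numpy sketch of a soft ("Polyak") update under the standard convention --
# tau=1.0 degenerates to a hard copy; RLgraph's internal sync op may differ.
import numpy as np

def _soft_sync_sketch(target_weights, online_weights, tau):
    # target <- tau * online + (1 - tau) * target, applied per weight tensor.
    return [tau * w + (1.0 - tau) * t for t, w in zip(target_weights, online_weights)]

_target = [np.zeros((2, 2))]
_online = [np.ones((2, 2))]
assert np.allclose(_soft_sync_sketch(_target, _online, tau=1.0)[0], 1.0)    # hard copy
assert np.allclose(_soft_sync_sketch(_target, _online, tau=0.05)[0], 0.05)  # slow tracking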
def test_with_final_eval(self):
    """
    Tests whether Ape-X can learn a simple environment using a single worker
    (thus replicating DQN), then runs a final evaluation of the learned policy.
    """
    env_spec = dict(type="openai", gym_env="CartPole-v0")
    agent_config = config_from_path("configs/apex_agent_cartpole.json")

    # Use n-step adjustments.
    agent_config["execution_spec"]["ray_spec"]["worker_spec"]["n_step_adjustment"] = 3
    agent_config["execution_spec"]["ray_spec"]["apex_replay_spec"]["n_step_adjustment"] = 3
    agent_config["n_step"] = 3

    # Define executor, test assembly.
    executor = ApexExecutor(
        environment_spec=env_spec,
        agent_config=agent_config,
    )
    print("Successfully created executor.")

    # Execute the actual workload.
    result = executor.execute_workload(
        workload=dict(num_timesteps=20000, report_interval=1000, report_interval_min_seconds=1)
    )
    print("Finished executing workload:")
    print(result)

    # Get agent.
    agent = executor.local_agent
    preprocessing_spec = agent_config["preprocessing_spec"]

    # Create env.
    env = OpenAIGymEnv.from_spec(env_spec)

    if preprocessing_spec is not None:
        preprocessing_spec = deepcopy(preprocessing_spec)
        in_space = env.state_space.with_batch_rank()
        in_space = deepcopy(in_space)
        # Set scopes.
        scopes = [preprocessor["scope"] for preprocessor in preprocessing_spec]
        # Set backend to python.
        for spec in preprocessing_spec:
            spec["backend"] = "python"

        processor_stack = PreprocessorStack(*preprocessing_spec, backend="python")
        build_space = in_space
        for sub_comp_scope in scopes:
            processor_stack.sub_components[sub_comp_scope].create_variables(
                input_spaces=dict(preprocessing_inputs=build_space),
                action_space=None
            )
            build_space = processor_stack.sub_components[sub_comp_scope].get_preprocessed_space(build_space)
        processor_stack.reset()
    else:
        processor_stack = None

    ep_rewards = []
    print("Finished learning, starting eval.")
    for _ in range(10):
        state = env.reset()
        terminal = False
        ep_reward = 0
        while not terminal:
            state = agent.state_space.force_batch(state)
            if processor_stack is not None:
                state = processor_stack.preprocess(state)
            actions = agent.get_action(states=state, use_exploration=False, apply_preprocessing=False)
            next_state, step_reward, terminal, info = env.step(actions=actions[0])
            ep_reward += step_reward
            state = next_state
            if terminal:
                ep_rewards.append(ep_reward)
                break

    print("Eval episode rewards:")
    print(ep_rewards)
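# Hedged sketch of the n_step_adjustment=3 setting used above: with n-step returns,
# each transition's reward becomes the discounted sum of the next n rewards
# (R_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2}) and next_state skips n steps
# ahead. This shows the standard textbook computation; RLgraph's worker-side
# implementation may differ in how it handles episode boundaries.
def _n_step_rewards_sketch(rewards, gamma, n):
    return [
        sum(gamma ** k * rewards[t + k] for k in range(n))
        for t in range(len(rewards) - n + 1)
    ]

assert _n_step_rewards_sketch([1.0, 1.0, 1.0, 1.0], gamma=0.5, n=3) == [1.75, 1.75]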
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, exploration_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None,
             update_spec=None, summary_spec=None, saver_spec=None, auto_build=True, name="agent"):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        discount (float): The discount factor (gamma).
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
            object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list): Neural network specification for the baseline (value function).
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
            graph builder. If False, users must separately call agent.build(). Useful for debugging or
            analyzing components before building.
        name (str): Some name for this Agent object.
    """
    super(Agent, self).__init__()
    self.name = name
    self.auto_build = auto_build
    self.graph_built = False
    self.logger = logging.getLogger(__name__)

    self.state_space = Space.from_spec(state_space).with_batch_rank(False)
    self.flat_state_space = self.state_space.flatten() if isinstance(self.state_space, ContainerSpace) else None
    self.logger.info("Parsed state space definition: {}".format(self.state_space))
    self.action_space = Space.from_spec(action_space).with_batch_rank(False)
    self.flat_action_space = self.action_space.flatten() if isinstance(self.action_space, ContainerSpace) else None
    self.logger.info("Parsed action space definition: {}".format(self.action_space))

    self.discount = discount
    self.build_options = {}

    # The agent's root-Component.
    self.root_component = Component(name=self.name, nesting_level=0)

    # Define the input-Spaces:
    # Tag the input-Space to `self.set_weights` as equal to whatever the variables-Space will be for
    # the Agent's policy Component.
    self.input_spaces = dict(
        states=self.state_space.with_batch_rank(),
    )

    # Construct the Preprocessor.
    self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
    self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
    self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
    if self.preprocessing_required:
        self.logger.info("Preprocessing required.")
        self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
    else:
        self.logger.info("No preprocessing required.")

    # Construct the Policy network.
    policy_spec = policy_spec or dict()
    if "network_spec" not in policy_spec:
        policy_spec["network_spec"] = network_spec
    if "action_space" not in policy_spec:
        policy_spec["action_space"] = self.action_space
    self.policy_spec = policy_spec
    # The behavioral policy of the algorithm. Also the one that gets updated.
    self.policy = Policy.from_spec(self.policy_spec)
    # Done by default.
    self.policy.add_components(Synchronizable(), expose_apis="sync")

    # Create non-shared baseline network.
    self.value_function = None
    if value_function_spec is not None:
        self.value_function = ValueFunction(network_spec=value_function_spec)
        self.value_function.add_components(Synchronizable(), expose_apis="sync")
        self.vars_merger = ContainerMerger("policy", "vf", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", "vf", scope="variable-container-splitter")
    else:
        self.vars_merger = ContainerMerger("policy", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", scope="variable-container-splitter")

    self.internal_states_space = Space.from_spec(internal_states_space)

    # An object implementing the loss function interface is only strictly needed
    # if automatic device strategies like multi-gpu are enabled. This is because
    # the device strategy needs to know the name of the loss function to infer the appropriate
    # operations.
    self.loss_function = None

    self.exploration = Exploration.from_spec(exploration_spec)
    self.execution_spec = parse_execution_spec(execution_spec)

    # Python-side experience buffer for better performance (may be disabled).
    self.default_env = "env_0"

    def factory_(i):
        # For 0 or 1 flat keys, a plain list suffices; otherwise, one list per flat key.
        if i < 2:
            return []
        return tuple([[] for _ in range(i)])

    self.states_buffer = defaultdict(list)  # partial(factory_, len(self.flat_state_space))
    self.actions_buffer = defaultdict(partial(factory_, len(self.flat_action_space or [])))
    self.internals_buffer = defaultdict(list)
    self.rewards_buffer = defaultdict(list)
    self.next_states_buffer = defaultdict(list)  # partial(factory_, len(self.flat_state_space))
    self.terminals_buffer = defaultdict(list)

    self.observe_spec = parse_observe_spec(observe_spec)

    # Global time step counter.
    self.timesteps = 0

    # Create the Agent's optimizer based on optimizer_spec and execution strategy.
    self.optimizer = None
    if optimizer_spec is not None:
        # Save spec in case the agent needs to create more optimizers, e.g. for the baseline.
        self.optimizer_spec = optimizer_spec
        self.optimizer = Optimizer.from_spec(optimizer_spec)

    self.value_function_optimizer = None
    if self.value_function is not None:
        if value_function_optimizer_spec is None:
            vf_optimizer_spec = self.optimizer_spec
        else:
            vf_optimizer_spec = value_function_optimizer_spec
        vf_optimizer_spec["scope"] = "value-function-optimizer"
        self.value_function_optimizer = Optimizer.from_spec(vf_optimizer_spec)

    # Update-spec dict tells the Agent how to update (e.g. memory batch size).
    self.update_spec = parse_update_spec(update_spec)

    # Create our GraphBuilder and -Executor.
    self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
    self.graph_executor = GraphExecutor.from_spec(
        get_backend(),
        graph_builder=self.graph_builder,
        execution_spec=self.execution_spec,
        saver_spec=saver_spec
    )  # type: GraphExecutor
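# Illustrative sketch (hypothetical stand-in, NOT the RLgraph Space API) of the
# container-space flattening done in the constructor above via
# `self.state_space.flatten()`: a nested Dict space becomes a flat
# {flat-key: primitive-space} mapping, so per-key buffers and batches can be
# handled as parallel flat structures (cf. `factory_` and the buffers above).
def _flatten_space_sketch(space, prefix=""):
    if isinstance(space, dict):
        flat = {}
        for key, sub_space in space.items():
            flat.update(_flatten_space_sketch(sub_space, prefix + "/" + key))
        return flat
    return {prefix or "/": space}

_nested = dict(camera="FloatBox((84, 84, 3))", proprio=dict(joints="FloatBox((7,))"))
assert _flatten_space_sketch(_nested) == {
    "/camera": "FloatBox((84, 84, 3))",
    "/proprio/joints": "FloatBox((7,))",
}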
def test_sac_agent_component_functionality(self):
    config = config_from_path("configs/sac_component_for_fake_env_test.json")

    # Arbitrary state space, state should not be used in this example.
    state_space = FloatBox(shape=(8,))
    continuous_action_space = FloatBox(shape=(1,), low=-2.0, high=2.0)
    terminal_space = BoolBox(add_batch_rank=True)
    rewards_space = FloatBox(add_batch_rank=True)

    policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
    policy.add_components(Synchronizable(), expose_apis="sync")
    q_function = ValueFunction.from_spec(config["value_function"])

    agent_component = SACAgentComponent(
        agent=None,
        policy=policy,
        q_function=q_function,
        preprocessor=PreprocessorStack.from_spec([]),
        memory=ReplayMemory.from_spec(config["memory"]),
        discount=config["discount"],
        initial_alpha=config["initial_alpha"],
        target_entropy=None,
        optimizer=AdamOptimizer.from_spec(config["optimizer"]),
        vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
        alpha_optimizer=None,
        q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
        num_q_functions=2
    )

    test = ComponentTest(
        component=agent_component,
        input_spaces=dict(
            states=state_space.with_batch_rank(),
            preprocessed_states=state_space.with_batch_rank(),
            env_actions=continuous_action_space.with_batch_rank(),
            actions=continuous_action_space.with_batch_rank(),
            rewards=rewards_space,
            next_states=state_space.with_batch_rank(),
            terminals=terminal_space,
            batch_size=int,
            preprocessed_s_prime=state_space.with_batch_rank(),
            importance_weights=FloatBox(add_batch_rank=True),
            preprocessed_next_states=state_space.with_batch_rank(),
            deterministic=bool,
            weights="variables:{}".format(policy.scope),
            # TODO: how to provide the space for multiple component variables?
            # q_weights=Dict(
            #     q_0="variables:{}".format(q_function.scope),
            #     q_1="variables:{}".format(agent_component._q_functions[1].scope),
            # )
        ),
        action_space=continuous_action_space,
        build_kwargs=dict(
            optimizer=agent_component._optimizer,
            build_options=dict(
                vf_optimizer=agent_component.vf_optimizer,
            ),
        )
    )

    batch_size = 10
    action_sample = continuous_action_space.with_batch_rank().sample(batch_size)
    rewards = rewards_space.sample(batch_size)

    # Check whether an update runs ok.
    result = test.test(("update_from_external_batch", [
        state_space.sample(batch_size),
        action_sample,
        rewards,
        [True] * batch_size,
        state_space.sample(batch_size),
        [1.0] * batch_size  # importance
    ]))
    self.assertTrue(result["actor_loss"].dtype == np.float32)
    self.assertTrue(result["critic_loss"].dtype == np.float32)

    action_sample = np.linspace(-1, 1, batch_size).reshape((batch_size, 1))
    q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
    for q_val in q_values:
        self.assertTrue(q_val.dtype == np.float32)
        self.assertTrue(q_val.shape == (batch_size, 1))

    action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
    self.assertTrue(action_sample.dtype == np.float32)
    self.assertTrue(action_sample.shape == (batch_size, 1))
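# Hedged side note on the bounded action space above (FloatBox(low=-2.0, high=2.0)):
# SAC policies for bounded continuous spaces conventionally sample from a Gaussian
# and squash through tanh, rescaling into the action bounds. A minimal numpy sketch
# of that convention follows -- not necessarily RLgraph's exact distribution code.
import numpy as np

def _squash_sketch(raw_sample, low=-2.0, high=2.0):
    unit = np.tanh(raw_sample)                       # squash into (-1, 1)
    return low + (unit + 1.0) * 0.5 * (high - low)   # rescale into (low, high)

_squashed = _squash_sketch(np.random.randn(10, 1))
assert _squashed.min() > -2.0 and _squashed.max() < 2.0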
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, action_adapter_spec=None, exploration_spec=None, execution_spec=None,
             optimizer_spec=None, observe_spec=None, update_spec=None, summary_spec=None, saver_spec=None,
             auto_build=True, name="agent"):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        discount (float): The discount factor (gamma).
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
            object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        action_adapter_spec (Optional[dict,ActionAdapter]): The spec-dict for the ActionAdapter Component or the
            ActionAdapter object itself.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
            graph builder. If False, users must separately call agent.build(). Useful for debugging or
            analyzing components before building.
        name (str): Some name for this Agent object.
    """
    super(Agent, self).__init__()
    self.name = name
    self.auto_build = auto_build
    self.graph_built = False
    self.logger = logging.getLogger(__name__)

    self.state_space = Space.from_spec(state_space).with_batch_rank(False)
    self.logger.info("Parsed state space definition: {}".format(self.state_space))
    self.action_space = Space.from_spec(action_space).with_batch_rank(False)
    self.logger.info("Parsed action space definition: {}".format(self.action_space))

    self.discount = discount

    # The agent's root-Component.
    self.root_component = Component(name=self.name)

    # Define the input-Spaces:
    # Tag the input-Space to `self.set_policy_weights` as equal to whatever the variables-Space will be for
    # the Agent's policy Component.
    self.input_spaces = dict(
        states=self.state_space.with_batch_rank(),
    )

    # Construct the Preprocessor.
    self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
    self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
    # Note: `> 0` (not `> 1`) -- a single preprocessor still requires preprocessing.
    self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
    if self.preprocessing_required:
        self.logger.info("Preprocessing required.")
        self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
    else:
        self.logger.info("No preprocessing required.")

    # Construct the Policy network.
    self.neural_network = None
    if network_spec is not None:
        self.neural_network = NeuralNetwork.from_spec(network_spec)
    self.action_adapter_spec = action_adapter_spec
    self.internal_states_space = internal_states_space

    # An object implementing the loss function interface is only strictly needed
    # if automatic device strategies like multi-gpu are enabled. This is because
    # the device strategy needs to know the name of the loss function to infer the appropriate
    # operations.
    self.loss_function = None

    # The action adapter mapping raw NN output to (shaped) actions.
    action_adapter_dict = dict(action_space=self.action_space)
    if self.action_adapter_spec is None:
        self.action_adapter_spec = action_adapter_dict
    else:
        self.action_adapter_spec.update(action_adapter_dict)

    # The behavioral policy of the algorithm. Also the one that gets updated.
    self.policy = Policy(
        network_spec=self.neural_network,
        action_adapter_spec=self.action_adapter_spec
    )

    self.exploration = Exploration.from_spec(exploration_spec)
    self.execution_spec = parse_execution_spec(execution_spec)

    # Python-side experience buffer for better performance (may be disabled).
    self.default_env = "env_0"
    self.states_buffer = defaultdict(list)
    self.actions_buffer = defaultdict(list)
    self.internals_buffer = defaultdict(list)
    self.rewards_buffer = defaultdict(list)
    self.next_states_buffer = defaultdict(list)
    self.terminals_buffer = defaultdict(list)

    self.observe_spec = parse_observe_spec(observe_spec)
    if self.observe_spec["buffer_enabled"]:
        self.reset_env_buffers()

    # Global time step counter.
    self.timesteps = 0

    # Create the Agent's optimizer based on optimizer_spec and execution strategy.
    self.optimizer = None
    if optimizer_spec is not None:
        self.optimizer = Optimizer.from_spec(optimizer_spec)
        # get_optimizer_from_device_strategy(
        #     optimizer_spec, self.execution_spec.get("device_strategy", 'default')
        # )

    # Update-spec dict tells the Agent how to update (e.g. memory batch size).
    self.update_spec = parse_update_spec(update_spec)

    # Create our GraphBuilder and -Executor.
    self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
    self.graph_executor = GraphExecutor.from_spec(
        get_backend(),
        graph_builder=self.graph_builder,
        execution_spec=self.execution_spec,
        saver_spec=saver_spec
    )  # type: GraphExecutor