def test_environment_stepper_on_2x2_grid_world(self):
    preprocessor_spec = [dict(
        type="reshape", flatten=True,
        flatten_categories=self.grid_world_2x2_action_space.num_categories
    )]
    network_spec = config_from_path("configs/test_simple_nn.json")
    # Try to find a NN that outputs greedy actions down in start state and right in state=1 (to reach goal).
    network_spec["layers"][0]["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]]
    network_spec["layers"][0]["biases_spec"] = False
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(
            network_spec=network_spec,
            action_adapter_spec=dict(
                weights_spec=[[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]],
                biases_spec=False
            ),
            action_space=self.grid_world_2x2_action_space,
            deterministic=True
        ),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="grid_world", world="2x2"),
        actor_component_spec=actor_component,
        state_space=self.grid_world_2x2_state_space,
        reward_space="float32",
        add_action_probs=True,
        action_probs_space=self.grid_world_2x2_action_probs_space,
        num_steps=5
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=self.grid_world_2x2_action_space,
    )

    # Step 5 times through the Env and collect results.
    expected = (
        np.array([False, True, False, True, False]),  # t_
        np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
        np.array([[0.21869287, 0.17905058, 0.36056358, 0.24169299],
                  [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                  [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                  [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                  [0.21869287, 0.17905058, 0.36056358, 0.24169299]], dtype=np.float32)
    )
    out = test.test("step", expected_outputs=expected, decimals=2)
    print(out)

    # Step again, check whether stitching of states/etc. works.
    expected = (
        np.array([True, False, True, False, True]),  # t_
        np.array([1, 0, 1, 0, 1, 0]),  # s' (raw)
        np.array([[0.2547221, 0.2651175, 0.23048209, 0.24967825],
                  [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                  [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                  [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                  [0.2547221, 0.2651175, 0.23048209, 0.24967825]], dtype=np.float32)
    )
    out = test.test("step", expected_outputs=expected)
    print(out)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
    preprocessor_spec = [dict(type="divide", divisor=2)]
    network_spec = config_from_path("configs/test_simple_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=3
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
    policy_scope = "environment-stepper/actor-component/policy/"
    weights_hid = weights[policy_scope + "test-network/hidden-layer/dense/kernel"]
    biases_hid = weights[policy_scope + "test-network/hidden-layer/dense/bias"]
    weights_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/kernel"]
    biases_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/bias"]

    # Step 3 times through the Env and collect results.
    expected = (
        # t_
        np.array([False, False, False]),
        # s' (raw)
        np.array([[0.0], [1.0], [2.0], [3.0]]),
        # action probs
        np.array([
            softmax(dense_layer(dense_layer(np.array([0.0]), weights_hid, biases_hid),
                                weights_action, biases_action)),
            softmax(dense_layer(dense_layer(np.array([0.5]), weights_hid, biases_hid),
                                weights_action, biases_action)),
            softmax(dense_layer(dense_layer(np.array([1.0]), weights_hid, biases_hid),
                                weights_action, biases_action))
        ])
    )
    test.test("step", expected_outputs=expected, decimals=3)

    # Step again, check whether stitching of states/etc. works.
    expected = (
        np.array([False, False, True]),
        np.array([[3.0], [4.0], [5.0], [0.0]]),  # s' (raw)
        np.array([
            softmax(dense_layer(dense_layer(np.array([1.5]), weights_hid, biases_hid),
                                weights_action, biases_action)),
            softmax(dense_layer(dense_layer(np.array([2.0]), weights_hid, biases_hid),
                                weights_action, biases_action)),
            softmax(dense_layer(dense_layer(np.array([2.5]), weights_hid, biases_hid),
                                weights_action, biases_action))
        ])
    )
    test.test("step", expected_outputs=expected, decimals=3)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
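# Note: the expected action-prob arrays above are computed with small numpy helpers
# (`dense_layer`, `softmax`) imported by this test module. A minimal sketch of the assumed
# semantics (illustrative only; the signatures mirror how the helpers are called in this file,
# not necessarily their exact library definitions):
#
#     import numpy as np
#
#     def dense_layer(inputs, weights, biases=None):
#         # Plain affine transform: inputs @ weights (+ biases).
#         out = np.matmul(inputs, weights)
#         return out if biases is None else out + biases
#
#     def softmax(logits, axis=-1):
#         # Numerically stable softmax over the last axis.
#         shifted = logits - np.max(logits, axis=axis, keepdims=True)
#         e = np.exp(shifted)
#         return e / np.sum(e, axis=axis, keepdims=True)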
def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
    preprocessor_spec = [dict(type="multiply", factor=0.1)]
    network_spec = config_from_path("configs/test_lstm_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        internal_states_space=internal_states_space,
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=4,
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
    policy_scope = "environment-stepper/actor-component/policy/"
    weights_lstm = weights[policy_scope + "test-lstm-network/lstm-layer/lstm-cell/kernel"]
    biases_lstm = weights[policy_scope + "test-lstm-network/lstm-layer/lstm-cell/bias"]
    weights_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/kernel"]
    biases_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/bias"]

    # Step 4 times through the Env and collect results.
    lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
    lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
    lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
    lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
    expected = (
        np.array([False, False, True, False]),
        np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
        np.array([
            softmax(dense_layer(np.squeeze(lstm_1[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_2[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_3[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_4[0]), weights_action, biases_action)),
        ]),  # action probs
        # internal states
        (
            np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0], lstm_3[1][0], lstm_4[1][0]])),
            np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1], lstm_3[1][1], lstm_4[1][1]]))
        )
    )
    test.test("step", expected_outputs=expected)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
def test_to_find_out_what_breaks_specifiable_server_start_via_thread_pools(self):
    env_spec = dict(type="deepmind_lab", level_id="seekavoid_arena_01",
                    observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space
    actor_component = ActorComponent(
        # Preprocessor spec (only for image and prev-action channel).
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # The images from the env are divided by 255.
                RGB_INTERLEAVED=[dict(type="divide", divisor=255)],
                # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                previous_action=[
                    dict(type="reshape", flatten=True, flatten_categories=action_space.num_categories)
                ],
                previous_reward=[
                    dict(type="reshape", new_shape=(1,)),
                    dict(type="convert_type", to_dtype="float32")
                ],
            )
        ),
        # Policy spec.
        dict(network_spec=LargeIMPALANetwork(), action_space=action_space),
        # Exploration spec.
        Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
        )))
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space,
        num_steps=100,
        # Add both prev-action and -reward into the state sent through the network.
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=self.action_probs_space
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
    )
    # Reset the stepper.
    test.test("reset")
def test_environment_stepper_on_pong(self): environment_spec = dict(type="openai-gym", gym_env="Pong-v0", frameskip=4, seed=10) dummy_env = Environment.from_spec(environment_spec) state_space = dummy_env.state_space action_space = dummy_env.action_space agent_config = config_from_path("configs/dqn_agent_for_pong.json") actor_component = ActorComponent( agent_config["preprocessing_spec"], dict(network_spec=agent_config["network_spec"], action_space=action_space, **agent_config["policy_spec"]), agent_config["exploration_spec"]) environment_stepper = EnvironmentStepper( environment_spec=environment_spec, actor_component_spec=actor_component, state_space=state_space, reward_space="float", add_reward=True, num_steps=self.time_steps) test = ComponentTest( component=environment_stepper, action_space=action_space, ) # Step 30 times through the Env and collect results. # 1st return value is the step-op (None), 2nd return value is the tuple of items (3 steps each), with each # step containing: Preprocessed state, actions, rewards, episode returns, terminals, (raw) next-states. time_start = time.monotonic() out = test.test("step") time_end = time.monotonic() print("Done running {} steps in env-stepper env in {}sec.".format( environment_stepper.num_steps, time_end - time_start)) # Check types of outputs. self.assertTrue(isinstance( out, DataOpTuple)) # the step results as a tuple (see below) # Check types of single data. self.assertTrue(out[0].dtype == np.bool_) # next-state is terminal? self.assertTrue( out[1].dtype == np.uint8) # next state (raw, not preprocessed) self.assertTrue(out[1].min() >= 0) # make sure we have pixels self.assertTrue(out[1].max() <= 255) self.assertTrue(out[2].dtype == np.float32) # rewards self.assertTrue(out[2].min() >= -1.0) # -1.0 to 1.0 self.assertTrue(out[2].max() <= 1.0) # Make sure we close the session (to shut down the Env on the server). test.terminate()
def test_environment_stepper_on_2x2_grid_world_returning_actions_and_rewards(self):
    preprocessor_spec = [dict(
        type="reshape", flatten=True,
        flatten_categories=self.grid_world_2x2_action_space.num_categories
    )]
    network_spec = config_from_path("configs/test_simple_nn.json")
    # Try to find a NN that outputs greedy actions down in start state and right in state=1 (to reach goal).
    network_spec["layers"][0]["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]]
    network_spec["layers"][0]["biases_spec"] = False
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(
            network_spec=network_spec,
            action_adapter_spec=dict(
                weights_spec=[[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]],
                biases_spec=False
            ),
            action_space=self.grid_world_2x2_action_space,
            deterministic=True
        ),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="grid_world", world="2x2"),
        actor_component_spec=actor_component,
        state_space=self.grid_world_2x2_state_space,
        reward_space="float32",
        add_action=True,
        add_reward=True,
        num_steps=5
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=self.grid_world_2x2_action_space,
    )

    # Step 5 times through the Env and collect results.
    expected = (
        np.array([False, True, False, True, False]),  # t_
        np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
        np.array([2, 1, 2, 1, 2]),  # actions taken
        np.array([-1.0, 1.0, -1.0, 1.0, -1.0])  # rewards
    )
    out = test.test("step", expected_outputs=expected, decimals=2)
    print(out)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
def test_environment_stepper_on_deterministic_env(self):
    preprocessor_spec = None
    network_spec = config_from_path("configs/test_simple_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=5),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        num_steps=3
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    # Reset the stepper.
    test.test("reset")

    # Step 3 times through the Env and collect results.
    expected = (
        None,
        (
            np.array([True, False, False, False]),  # t_
            np.array([[0.0], [1.0], [2.0], [3.0]]),  # s' (raw)
        )
    )
    test.test("step", expected_outputs=expected)

    # Step again, check whether stitching of states/etc. works.
    expected = (
        None,
        (
            np.array([False, False, True, False]),  # t_
            np.array([[3.0], [4.0], [0.0], [1.0]]),  # s' (raw)
        )
    )
    test.test("step", expected_outputs=expected)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None,
             num_workers=1, worker_sample_size=100, dynamic_batching=False, visualize=False, **kwargs):
    """
    Args:
        discount (float): The discount factor gamma.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
            `network_spec` is given explicitly in kwargs. Default: "large".
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
        environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to
            the ActionComponent's (NN's) input at each step. This is only possible if the state space is already
            a Dict. It will be added under the key "previous_action". Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to
            the ActionComponent's (NN's) input at each step. This is only possible if the state space is already
            a Dict. It will be added under the key "previous_reward". Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        num_workers (int): How many actors (workers) should be run in separate threads.
        worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.
        dynamic_batching (bool): Whether to use deepmind's custom dynamic batching op for wrapping the
            optimizer's step call. The batcher.so file must be compiled for this to work (see Docker file).
            Default: False.
        visualize (Union[int,bool]): Whether and how many workers to visualize.
            Default: False (no visualization).
    """
    # Now that we fixed the Agent's spec, call the super constructor.
    super(SingleIMPALAAgent, self).__init__(
        type="single",
        discount=discount,
        architecture=architecture,
        fifo_queue_spec=fifo_queue_spec,
        environment_spec=environment_spec,
        feed_previous_action_through_nn=feed_previous_action_through_nn,
        feed_previous_reward_through_nn=feed_previous_reward_through_nn,
        weight_pg=weight_pg,
        weight_baseline=weight_baseline,
        weight_entropy=weight_entropy,
        worker_sample_size=worker_sample_size,
        name=kwargs.pop("name", "impala-single-agent"),
        **kwargs
    )
    self.dynamic_batching = dynamic_batching
    self.num_workers = num_workers
    self.visualize = visualize

    # If we use dynamic batching, wrap the dynamic batcher around the policy's graph_fn that we
    # actually call below during our build.
    if self.dynamic_batching:
        self.policy = DynamicBatchingPolicy(policy_spec=self.policy, scope="")

    self.env_output_splitter = ContainerSplitter(
        tuple_length=3 if self.has_rnn is False else 4, scope="env-output-splitter")
    self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
    self.states_dict_splitter = ContainerSplitter(
        *list(self.fifo_record_space["states"].keys() if isinstance(self.state_space, Dict) else "dummy"),
        scope="states-dict-splitter"
    )
    self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

    # Slice some data from the EnvStepper (e.g. only first internal states are needed).
    if self.has_rnn:
        internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
    else:
        internal_states_slicer = None

    self.transposer = Transpose(scope="transposer")

    # Create an IMPALALossFunction with some parameters.
    self.loss_function = IMPALALossFunction(
        discount=self.discount,
        weight_pg=weight_pg,
        weight_baseline=weight_baseline,
        weight_entropy=weight_entropy,
        slice_actions=self.feed_previous_action_through_nn,
        slice_rewards=self.feed_previous_reward_through_nn
    )

    # Merge back to insert into FIFO.
    self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

    # Dummy Flattener to calculate action-probs space.
    dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)

    self.environment_steppers = list()
    for i in range(self.num_workers):
        environment_spec_ = copy.deepcopy(environment_spec)
        if self.visualize is True or (isinstance(self.visualize, int) and i + 1 <= self.visualize):
            environment_spec_["visualize"] = True

        # Force worker_sample_size for IMPALA NNs (LSTM) in env-stepper to be 1.
        policy_spec = copy.deepcopy(self.policy_spec)
        if isinstance(policy_spec, dict) and isinstance(policy_spec["network_spec"], dict) and \
                "type" in policy_spec["network_spec"] and \
                "IMPALANetwork" in policy_spec["network_spec"]["type"]:
            policy_spec["network_spec"]["worker_sample_size"] = 1

        env_stepper = EnvironmentStepper(
            environment_spec=environment_spec_,
            actor_component_spec=ActorComponent(
                preprocessor_spec=self.preprocessing_spec,
                policy_spec=policy_spec,
                exploration_spec=self.exploration_spec
            ),
            state_space=self.state_space.with_batch_rank(),
            action_space=self.action_space.with_batch_rank(),
            reward_space=float,
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            add_action=not self.feed_previous_action_through_nn,
            add_reward=not self.feed_previous_reward_through_nn,
            add_previous_action_to_state=self.feed_previous_action_through_nn,
            add_previous_reward_to_state=self.feed_previous_reward_through_nn,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space),
            scope="env-stepper-{}".format(i)
        )
        if self.dynamic_batching:
            env_stepper.actor_component.policy.parent_component = None
            env_stepper.actor_component.policy = DynamicBatchingPolicy(
                policy_spec=env_stepper.actor_component.policy, scope="")
            env_stepper.actor_component.add_components(env_stepper.actor_component.policy)

        self.environment_steppers.append(env_stepper)

    # Create the QueueRunners (one for each env-stepper).
    self.queue_runner = QueueRunner(
        self.fifo_queue, "step", -1,  # -1: Take entire return value of API-method `step` as record to insert.
        self.env_output_splitter,
        self.fifo_input_merger,
        internal_states_slicer,
        *self.environment_steppers
    )

    sub_components = [
        self.fifo_output_splitter, self.fifo_queue, self.queue_runner,
        self.transposer, self.staging_area, self.preprocessor,
        self.states_dict_splitter, self.policy, self.loss_function, self.optimizer
    ]

    # Add all the agent's sub-components to the root.
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph([self.root_component], self.input_spaces,
                          optimizer=self.optimizer, build_options=None)
        self.graph_built = True

        if self.has_gpu:
            # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op))

            # TODO remove after full refactor.
            self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                out_op_columns[0].op_records[0].op
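# A hedged construction sketch for SingleIMPALAAgent (the import paths, environment spec and
# space arguments below are illustrative assumptions, not taken from this file):
#
#     from rlgraph.agents import SingleIMPALAAgent
#     from rlgraph.environments import Environment
#
#     env_spec = dict(type="deepmind_lab", level_id="seekavoid_arena_01",
#                     observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
#     env = Environment.from_spec(env_spec)
#     agent = SingleIMPALAAgent(
#         architecture="large",
#         environment_spec=env_spec,
#         state_space=env.state_space,
#         action_space=env.action_space,
#         num_workers=2,
#         worker_sample_size=100
#     )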
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None, worker_sample_size=100, **kwargs):
    """
    Args:
        discount (float): The discount factor gamma.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
            `network_spec` is given explicitly in kwargs. Default: "large".
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
        environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to
            the ActionComponent's (NN's) input at each step. This is only possible if the state space is already
            a Dict. It will be added under the key "previous_action". Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to
            the ActionComponent's (NN's) input at each step. This is only possible if the state space is already
            a Dict. It will be added under the key "previous_reward". Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

    Keyword Args:
        type (str): One of "single", "actor" or "learner". Default: "single".
    """
    type_ = kwargs.pop("type", "single")
    assert type_ in ["single", "actor", "learner"]
    self.type = type_
    self.worker_sample_size = worker_sample_size

    # Network-spec by default is a "large architecture" IMPALA network.
    self.network_spec = kwargs.pop(
        "network_spec",
        dict(type="rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork".format(
            "Large" if architecture == "large" else "Small"))
    )
    if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
            "IMPALANetwork" in self.network_spec["type"]:
        self.network_spec = default_dict(
            self.network_spec,
            dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
        )

    # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
    self.exploration_spec = kwargs.pop("exploration_spec", None)
    optimizer_spec = kwargs.pop("optimizer_spec", None)
    observe_spec = kwargs.pop("observe_spec", None)

    self.feed_previous_action_through_nn = feed_previous_action_through_nn
    self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

    # Run everything in a single process.
    if self.type == "single":
        environment_spec = environment_spec or self.default_environment_spec
        update_spec = kwargs.pop("update_spec", None)
    # Actors won't need to learn (no optimizer needed in graph).
    elif self.type == "actor":
        optimizer_spec = None
        update_spec = kwargs.pop("update_spec", dict(do_updates=False))
        environment_spec = environment_spec or self.default_environment_spec
    # Learners won't need to explore (act) or observe (insert into Queue).
    else:
        observe_spec = None
        update_spec = kwargs.pop("update_spec", None)
        environment_spec = None

    # Add previous-action/reward preprocessors to env-specific preprocessor spec.
    # TODO: remove this empty hard-coded preprocessor.
    self.preprocessing_spec = kwargs.pop(
        "preprocessing_spec",
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # Flatten actions.
                previous_action=[
                    dict(type="reshape", flatten=True,
                         flatten_categories=kwargs.get("action_space").num_categories)
                ],
                # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
                previous_reward=[dict(type="reshape", new_shape=(1,))]
            )
        )
    )

    # Limit communication in distributed mode between each actor and the learner (never between actors).
    execution_spec = kwargs.pop("execution_spec", None)
    if execution_spec is not None and execution_spec.get("mode") == "distributed":
        default_dict(
            execution_spec["session_config"],
            dict(type="monitored-training-session",
                 allow_soft_placement=True,
                 device_filters=["/job:learner/task:0"] + (
                     ["/job:actor/task:{}".format(execution_spec["distributed_spec"]["task_index"])]
                     if self.type == "actor" else ["/job:learner/task:0"]
                 ))
        )
        # If Actor, make non-chief in either case (even if task idx == 0).
        if self.type == "actor":
            execution_spec["distributed_spec"]["is_chief"] = False
            # Hard-set device to the CPU for actors.
            execution_spec["device_strategy"] = "custom"
            execution_spec["default_device"] = "/job:{}/task:{}/cpu".format(
                self.type, execution_spec["distributed_spec"]["task_index"])

    self.policy_spec = kwargs.pop("policy_spec", dict())
    # TODO: Create some auto-setting based on LSTM inside the NN.
    default_dict(
        self.policy_spec,
        dict(type="shared-value-function-policy",
             deterministic=False,
             reuse_variable_scope="shared-policy",
             action_space=kwargs.get("action_space"))
    )

    # Now that we fixed the Agent's spec, call the super constructor.
    super(IMPALAAgent, self).__init__(
        discount=discount,
        preprocessing_spec=self.preprocessing_spec,
        network_spec=self.network_spec,
        policy_spec=self.policy_spec,
        exploration_spec=self.exploration_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        execution_spec=execution_spec,
        name=kwargs.pop("name", "impala-{}-agent".format(self.type)),
        **kwargs
    )

    # Always use 1st learner as the parameter server for all policy variables.
    if self.execution_spec["mode"] == "distributed" and \
            self.execution_spec["distributed_spec"]["cluster_spec"]:
        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu")))

    # Check whether we have an RNN.
    self.has_rnn = self.policy.neural_network.has_rnn()
    # Check whether we are running with GPU.
    self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
        self.execution_spec["gpu_spec"]["num_gpus"] > 0

    # Some FIFO-queue specs.
    self.fifo_queue_keys = ["terminals", "states"] + \
        (["actions"] if not self.feed_previous_action_through_nn else []) + \
        (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
        ["action_probs"] + \
        (["initial_internal_states"] if self.has_rnn else [])
    # Define FIFO record space.
    # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only
    # contain num-steps items.
    self.fifo_record_space = Dict(
        {
            "terminals": bool,
            "action_probs": FloatBox(shape=(self.action_space.num_categories,)),
        },
        add_batch_rank=False,
        add_time_rank=self.worker_sample_size
    )
    self.fifo_record_space["states"] = self.state_space.with_time_rank(self.worker_sample_size + 1)

    # Add action and rewards to state or do they have an extra channel?
    if self.feed_previous_action_through_nn:
        self.fifo_record_space["states"]["previous_action"] = \
            self.action_space.with_time_rank(self.worker_sample_size + 1)
    else:
        self.fifo_record_space["actions"] = self.action_space.with_time_rank(self.worker_sample_size)
    if self.feed_previous_reward_through_nn:
        self.fifo_record_space["states"]["previous_reward"] = \
            FloatBox(add_time_rank=self.worker_sample_size + 1)
    else:
        self.fifo_record_space["rewards"] = FloatBox(add_time_rank=self.worker_sample_size)

    if self.has_rnn:
        self.fifo_record_space["initial_internal_states"] = \
            self.internal_states_space.with_time_rank(False)

    # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
    self.fifo_queue = FIFOQueue.from_spec(
        fifo_queue_spec or dict(capacity=1),
        reuse_variable_scope="shared-fifo-queue",
        only_insert_single_records=True,
        record_space=self.fifo_record_space,
        device="/job:learner/task:0/cpu" if self.execution_spec["mode"] == "distributed" and
        self.execution_spec["distributed_spec"]["cluster_spec"] else None
    )

    # Remove `states` key from input_spaces: not needed.
    del self.input_spaces["states"]

    # Add all our sub-components to the core.
    if self.type == "single":
        pass

    elif self.type == "actor":
        # No learning, no loss function.
        self.loss_function = None
        # A Dict Splitter to split things from the EnvStepper.
        self.env_output_splitter = ContainerSplitter(tuple_length=4, scope="env-output-splitter")
        self.states_dict_splitter = None
        # Slice some data from the EnvStepper (e.g. only first internal states are needed).
        self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
        # Merge back to insert into FIFO.
        self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)
        # Dummy Flattener to calculate action-probs space.
        dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)
        self.environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=ActorComponent(self.preprocessor, self.policy, self.exploration),
            state_space=self.state_space.with_batch_rank(),
            reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space)
        )
        sub_components = [
            self.environment_stepper, self.env_output_splitter,
            self.internal_states_slicer, self.fifo_input_merger, self.fifo_queue
        ]
    # Learner.
    else:
        self.environment_stepper = None
        # A Dict splitter to split up items from the queue.
        self.fifo_input_merger = None
        self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
        self.states_dict_splitter = ContainerSplitter(
            *list(self.fifo_record_space["states"].keys()), scope="states-dict-splitter")
        self.internal_states_slicer = None
        self.transposer = Transpose(scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
        self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

        # Create an IMPALALossFunction with some parameters.
        self.loss_function = IMPALALossFunction(
            discount=self.discount,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            slice_actions=self.feed_previous_action_through_nn,
            slice_rewards=self.feed_previous_reward_through_nn,
            device="/job:learner/task:0/gpu"
        )

        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu", ops="/job:learner/task:0/gpu")))
        for component in [self.staging_area, self.preprocessor, self.optimizer]:
            component.propagate_sub_component_properties(dict(device="/job:learner/task:0/gpu"))

        sub_components = [
            self.fifo_output_splitter, self.fifo_queue, self.states_dict_splitter,
            self.transposer, self.staging_area, self.preprocessor, self.policy,
            self.loss_function, self.optimizer
        ]

    if self.type != "single":
        # Add all the agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_graph_api(*sub_components)

    if self.type != "single" and self.auto_build:
        if self.type == "learner":
            build_options = dict(
                build_device_context="/job:learner/task:0/cpu",
                pin_global_variable_device="/job:learner/task:0/cpu"
            )
            self._build_graph([self.root_component], self.input_spaces,
                              optimizer=self.optimizer, build_options=build_options)
        else:
            self._build_graph([self.root_component], self.input_spaces,
                              optimizer=self.optimizer, build_options=None)

        self.graph_built = True

        if self.has_gpu:
            # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op))

            # TODO remove after full refactor.
            self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                out_op_columns[0].op_records[0].op
        if self.type == "actor":
            self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                out_op_columns[0].op_records[0].op
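# For the distributed setup handled above, actor and learner processes would construct this same
# class with different `type` kwargs. A hedged sketch (import path, cluster spec, session config
# and space arguments are illustrative assumptions, not taken from this file):
#
#     from rlgraph.agents import IMPALAAgent
#
#     common = dict(architecture="large", state_space=state_space, action_space=action_space,
#                   worker_sample_size=100)
#     actor = IMPALAAgent(
#         type="actor",
#         environment_spec=dict(type="deepmind_lab", level_id="seekavoid_arena_01",
#                               observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4),
#         execution_spec=dict(mode="distributed", session_config=dict(),
#                             distributed_spec=dict(job="actor", task_index=0,
#                                                   cluster_spec=cluster_spec)),
#         **common
#     )
#     learner = IMPALAAgent(
#         type="learner",
#         execution_spec=dict(mode="distributed", session_config=dict(),
#                             distributed_spec=dict(job="learner", task_index=0,
#                                                   cluster_spec=cluster_spec)),
#         **common
#     )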
def test_environment_stepper_on_deepmind_lab(self):
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("DeepmindLab not installed: Skipping this test case.")
        return

    env_spec = dict(type="deepmind_lab", level_id="seekavoid_arena_01",
                    observations=["RGB_INTERLEAVED"], frameskip=4)
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space
    actor_component = ActorComponent(
        # Preprocessor spec (only divide and flatten the image).
        [{"type": "divide", "divisor": 255},
         {"type": "reshape", "flatten": True}],
        # Policy spec.
        dict(network_spec="../configs/test_lstm_nn.json", action_space=action_space),
        # Exploration spec.
        Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
        )))
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space_test_lstm,
        num_steps=1000,
        # Add both prev-action and -reward into the state sent through the network.
        # add_previous_action_to_state=True,
        # add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=FloatBox(shape=(9,), add_batch_rank=True)
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
    )
    # Reset the stepper.
    test.test("reset")

    # Step n times through the Env and collect results.
    # 1st return value is the step-op (None), 2nd return value is the tuple of step results.
    time_start = time.monotonic()
    steps = 10
    out = None
    for _ in range(steps):
        out = test.test("step")
    time_total = time.monotonic() - time_start
    print("Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)".format(
        steps, environment_stepper.num_steps, time_total,
        environment_stepper.num_steps * steps / time_total))

    # Check types of outputs.
    self.assertTrue(out[0] is None)
    self.assertTrue(isinstance(out[1], DataOpTuple))  # the step results as a tuple (see below)
    # Check types of single data.
    # self.assertTrue(out[0].dtype == np.float32)
    # self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
    # self.assertTrue(out[0].max() <= 1.0)
    # self.assertTrue(out[1].dtype == np.int32)  # actions
    # self.assertTrue(out[2].dtype == np.float32)  # rewards
    # self.assertTrue(out[0].dtype == np.float32)  # episode return
    self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
    self.assertTrue(out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
    self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
    self.assertTrue(out[1][1].max() <= 255)
    # action probs (test whether sum to one).
    # self.assertTrue(out[1][6].dtype == np.float32)
    # self.assertTrue(out[1][6].min() >= 0.0)
    # self.assertTrue(out[1][6].max() <= 1.0)
    # recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
    #                               np.ones(shape=(environment_stepper.num_steps,)), decimals=4)
    # internal states (c- and h-state)
    self.assertTrue(out[3][0].dtype == np.float32)
    self.assertTrue(out[3][1].dtype == np.float32)
    self.assertTrue(out[3][0].shape == (environment_stepper.num_steps, 3))
    self.assertTrue(out[3][1].shape == (environment_stepper.num_steps, 3))

    # Check whether episode returns match single rewards (including terminal signals).
    # episode_returns = 0.0
    # for i in range(environment_stepper.num_steps):
    #     episode_returns += out[0][i]
    #     self.assertAlmostEqual(episode_returns, out[3][i])
    #     # Terminal: Reset for next step.
    #     if out[4][i] is np.bool_(True):
    #         episode_returns = 0.0

    test.terminate()
def test_environment_stepper_on_pong(self): environment_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10) dummy_env = Environment.from_spec(environment_spec) state_space = dummy_env.state_space action_space = dummy_env.action_space agent_config = config_from_path("configs/dqn_agent_for_pong.json") actor_component = ActorComponent( agent_config["preprocessing_spec"], dict(network_spec=agent_config["network_spec"], action_adapter_spec=agent_config["action_adapter_spec"], action_space=action_space), agent_config["exploration_spec"]) environment_stepper = EnvironmentStepper( environment_spec=environment_spec, actor_component_spec=actor_component, state_space=state_space, reward_space="float", add_reward=True, num_steps=self.time_steps) test = ComponentTest( component=environment_stepper, action_space=action_space, ) # Step 30 times through the Env and collect results. # 1st return value is the step-op (None), 2nd return value is the tuple of items (3 steps each), with each # step containing: Preprocessed state, actions, rewards, episode returns, terminals, (raw) next-states. # Reset the stepper. test.test("reset") time_start = time.monotonic() out = test.test("step") time_end = time.monotonic() print("Done running {} steps in env-stepper env in {}sec.".format( environment_stepper.num_steps, time_end - time_start)) # Check types of outputs. self.assertTrue(out[0] is None) self.assertTrue(isinstance( out[1], DataOpTuple)) # the step results as a tuple (see below) # Check types of single data. #self.assertTrue(out[1][0].dtype == np.float32) # preprocessed states #self.assertTrue(out[1][0].min() >= 0.0) # make sure we have pixels / 255 #self.assertTrue(out[1][0].max() <= 1.0) #self.assertTrue(out[1][1].dtype == np.int32) # actions #self.assertTrue(out[1][2].dtype == np.float32) # rewards #self.assertTrue(out[1][3].dtype == np.float32) # episode return self.assertTrue(out[1][0].dtype == np.bool_) # next-state is terminal? self.assertTrue( out[1][1].dtype == np.uint8) # next state (raw, not preprocessed) self.assertTrue(out[1][1].min() >= 0) # make sure we have pixels self.assertTrue(out[1][1].max() <= 255) self.assertTrue(out[1][2].dtype == np.float32) # rewards self.assertTrue(out[1][2].min() >= -1.0) # -1.0 to 1.0 self.assertTrue(out[1][2].max() <= 1.0) # Check whether episode returns match single rewards (including resetting after each terminal signal). #episode_returns = 0.0 #for i in range(environment_stepper.num_steps): # episode_returns += out[2][i] # self.assertAlmostEqual(episode_returns, out[1][3][i]) # # Terminal: Reset accumulated episode-return before next step. # if out[1][4][i] is np.bool_(True): # episode_returns = 0.0 # Make sure we close the session (to shut down the Env on the server). test.terminate()
def test_environment_stepper_component_with_large_impala_architecture(self):
    env_spec = dict(type="deepmind_lab", level_id="seekavoid_arena_01",
                    observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space
    actor_component = ActorComponent(
        # Preprocessor spec (only for image and prev-action channel).
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                ## The images from the env are divided by 255.
                # RGB_INTERLEAVED=[dict(type="divide", divisor=255)],
                # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                previous_action=[
                    dict(type="reshape", flatten=True, flatten_categories=action_space.num_categories)
                ],
                previous_reward=[
                    dict(type="reshape", new_shape=(1,)),
                    dict(type="convert_type", to_dtype="float32")
                ],
            )
        ),
        # Policy spec.
        dict(network_spec=LargeIMPALANetwork(), action_space=action_space),
        # Exploration spec.
        Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
        )))
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space,
        num_steps=100,
        # Add both prev-action and -reward into the state sent through the network.
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=self.action_probs_space
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
    )
    # Reset the stepper.
    test.test("reset")

    # Step n times through the Env and collect results.
    time_start = time.perf_counter()
    steps = 10
    for _ in range(steps):
        out = test.test("step")
    time_total = time.perf_counter() - time_start
    print("Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec).".format(
        steps, environment_stepper.num_steps, time_total,
        environment_stepper.num_steps * steps / time_total))

    # Check types of outputs.
    self.assertTrue(isinstance(out, DataOpTuple))  # the step results as a tuple (see below)
    # Check types of single data.
    self.assertTrue(out[0]["INSTR"].dtype == np.object)
    self.assertTrue(out[0]["RGB_INTERLEAVED"].dtype == np.float32)
    self.assertTrue(out[0]["RGB_INTERLEAVED"].min() >= 0.0)  # make sure we have pixels / 255
    self.assertTrue(out[0]["RGB_INTERLEAVED"].max() <= 1.0)
    self.assertTrue(out[1].dtype == np.int32)  # actions
    self.assertTrue(out[2].dtype == np.float32)  # rewards
    self.assertTrue(out[3].dtype == np.float32)  # episode return
    self.assertTrue(out[4].dtype == np.bool_)  # next-state is terminal?
    self.assertTrue(out[5]["INSTR"].dtype == np.object)  # next state (raw, not preprocessed)
    self.assertTrue(out[5]["RGB_INTERLEAVED"].dtype == np.uint8)  # next state (raw, not preprocessed)
    self.assertTrue(out[5]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
    self.assertTrue(out[5]["RGB_INTERLEAVED"].max() <= 255)
    # action probs (test whether sum to one).
    self.assertTrue(out[6].dtype == np.float32)
    self.assertTrue(out[6].min() >= 0.0)
    self.assertTrue(out[6].max() <= 1.0)
    recursive_assert_almost_equal(
        out[6].sum(axis=-1, keepdims=False),
        np.ones(shape=(environment_stepper.num_steps,)),
        decimals=4
    )
    # internal states (c- and h-state)
    self.assertTrue(out[7][0].dtype == np.float32)
    self.assertTrue(out[7][1].dtype == np.float32)
    self.assertTrue(out[7][0].shape == (environment_stepper.num_steps, 256))
    self.assertTrue(out[7][1].shape == (environment_stepper.num_steps, 256))

    # Check whether episode returns match single rewards (including terminal signals).
    episode_returns = 0.0
    for i in range(environment_stepper.num_steps):
        episode_returns += out[2][i]
        self.assertAlmostEqual(episode_returns, out[3][i])
        # Terminal: Reset for next step.
        if out[4][i] is np.bool_(True):
            episode_returns = 0.0

    test.terminate()
def test_environment_stepper_component_with_large_impala_architecture(self):
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("DeepmindLab not installed: Skipping this test case.")
        return

    worker_sample_size = 100
    env_spec = dict(type="deepmind_lab", level_id="seekavoid_arena_01",
                    observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
    dummy_env = DeepmindLabEnv.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space
    actor_component = ActorComponent(
        # Preprocessor spec (only for image and prev-action channel).
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                previous_action=[
                    dict(type="reshape", flatten=True, flatten_categories=action_space.num_categories)
                ],
                previous_reward=[
                    dict(type="reshape", new_shape=(1,)),
                    dict(type="convert_type", to_dtype="float32")
                ],
            )
        ),
        # Policy spec. worker_sample_size=1 as it's an actor network.
        dict(network_spec=LargeIMPALANetwork(worker_sample_size=1), action_space=action_space)
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space,
        num_steps=worker_sample_size,
        # Add both prev-action and -reward into the state sent through the network.
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=self.action_probs_space
    )
    test = ComponentTest(component=environment_stepper, action_space=action_space,
                         execution_spec=dict(disable_monitoring=True))

    environment_stepper.environment_server.start_server()

    # Step n times through the Env and collect results.
    time_start = time.perf_counter()
    steps = 10
    for _ in range(steps):
        out = test.test("step")
    time_total = time.perf_counter() - time_start
    print("Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec).".format(
        steps, environment_stepper.num_steps, time_total,
        environment_stepper.num_steps * steps / time_total))

    # Check types of outputs.
    self.assertTrue(isinstance(out, DataOpTuple))  # the step results as a tuple (see below)
    # Check types of single data.
    self.assertTrue(out[0].dtype == np.bool_)  # next-state is terminal?
    self.assertTrue(out[1]["INSTR"].dtype == np.object)
    self.assertTrue(out[1]["RGB_INTERLEAVED"].dtype == np.uint8)
    self.assertTrue(out[1]["RGB_INTERLEAVED"].shape == (worker_sample_size + 1,) +
                    state_space["RGB_INTERLEAVED"].shape)
    self.assertTrue(out[1]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
    self.assertTrue(out[1]["RGB_INTERLEAVED"].max() <= 255)
    self.assertTrue(out[1]["previous_action"].dtype == np.int32)  # actions
    self.assertTrue(out[1]["previous_action"].shape == (worker_sample_size + 1,))
    self.assertTrue(out[1]["previous_reward"].dtype == np.float32)  # rewards
    self.assertTrue(out[1]["previous_reward"].shape == (worker_sample_size + 1,))
    # action probs (test whether sum to one).
    self.assertTrue(out[2].dtype == np.float32)
    self.assertTrue(out[2].shape == (worker_sample_size, action_space.num_categories))
    self.assertTrue(out[2].min() >= 0.0)
    self.assertTrue(out[2].max() <= 1.0)
    recursive_assert_almost_equal(out[2].sum(axis=-1, keepdims=False),
                                  np.ones(shape=(worker_sample_size,)), decimals=4)
    # internal states (c- and h-state)
    self.assertTrue(out[3][0].dtype == np.float32)
    self.assertTrue(out[3][0].shape == (worker_sample_size + 1, 256))
    self.assertTrue(out[3][1].dtype == np.float32)
    self.assertTrue(out[3][1].shape == (worker_sample_size + 1, 256))

    environment_stepper.environment_server.stop_server()

    test.terminate()