def __init__(self, preprocessor_spec, policy_spec, exploration_spec, max_likelihood=None, **kwargs):
    """
    Args:
        preprocessor_spec (Union[list,dict,PreprocessorSpec]):
            - A dict if the state from the Env will come in as a ContainerSpace (e.g. Dict). In this case,
              each key in this dict specifies which value in the incoming dict should go through which
              PreprocessorStack.
            - A list with layer specs.
            - A PreprocessorStack object.
        policy_spec (Union[dict,Policy]): A specification dict for a Policy object or a Policy object directly.
        exploration_spec (Union[dict,Exploration]): A specification dict for an Exploration object or an
            Exploration object directly.
        max_likelihood (Optional[bool]): See Policy's property `max_likelihood`. If not None, overwrites the
            equally named setting in the Policy object (defined by `policy_spec`).
    """
    super(ActorComponent, self).__init__(scope=kwargs.pop("scope", "actor-component"), **kwargs)

    self.preprocessor = PreprocessorStack.from_spec(preprocessor_spec)
    self.policy = Policy.from_spec(policy_spec)
    self.exploration = Exploration.from_spec(exploration_spec)
    self.max_likelihood = max_likelihood

    self.add_components(self.policy, self.exploration, self.preprocessor)
def __init__(self, preprocessor_spec, policy_spec, exploration_spec=None, **kwargs):
    """
    Args:
        preprocessor_spec (Union[list,dict,PreprocessorSpec]):
            - A dict if the state from the Env will come in as a ContainerSpace (e.g. Dict). In this case,
              each key in this dict specifies which value in the incoming dict should go through which
              PreprocessorStack.
            - A list with layer specs.
            - A PreprocessorStack object.
        policy_spec (Union[dict,Policy]): A specification dict for a Policy object or a Policy object directly.
        exploration_spec (Union[dict,Exploration]): A specification dict for an Exploration object or an
            Exploration object directly.
    """
    super(ActorComponent, self).__init__(scope=kwargs.pop("scope", "actor-component"), **kwargs)

    self.preprocessor = PreprocessorStack.from_spec(preprocessor_spec)
    self.policy = Policy.from_spec(policy_spec)
    self.num_nn_inputs = self.policy.neural_network.num_inputs
    self.exploration = Exploration.from_spec(exploration_spec)

    self.tuple_merger = ContainerMerger(is_tuple=True, merge_tuples_into_one=True)
    self.tuple_splitter = ContainerSplitter(tuple_length=self.num_nn_inputs)

    self.add_components(
        self.policy, self.exploration, self.preprocessor, self.tuple_merger, self.tuple_splitter
    )
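
# Usage sketch (illustration only, not part of the original source): constructing an ActorComponent
# from the spec formats documented in the docstring above. Assumes ActorComponent and IntBox have been
# imported as in the surrounding code; the network config path "configs/my_nn.json" and the IntBox(4)
# action space are hypothetical placeholders.
example_preprocessor_spec = [
    dict(type="convert_type", to_dtype="float"),
    dict(type="divide", divisor=255)
]
example_policy_spec = dict(network_spec="configs/my_nn.json", action_space=IntBox(4))
example_exploration_spec = dict(
    epsilon_spec=dict(decay_spec=dict(type="linear_decay", from_=1.0, to_=0.1))
)
example_actor_component = ActorComponent(
    example_preprocessor_spec, example_policy_spec, example_exploration_spec
)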
def test_actor_component_with_lstm_network(self):
    # State space and internal-states space.
    state_space = FloatBox(shape=(2,), add_batch_rank=True, add_time_rank=True, time_major=False)
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)
    time_percentages_space = FloatBox()
    # Action space.
    action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

    preprocessor = PreprocessorStack.from_spec(
        [dict(type="convert_type", to_dtype="float"), dict(type="divide", divisor=10)]
    )
    policy = Policy(network_spec=config_from_path("configs/test_lstm_nn.json"), action_space=action_space)
    exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
        type="linear_decay", from_=1.0, to_=0.1
    )))

    actor_component = ActorComponent(preprocessor, policy, exploration)

    test = ComponentTest(
        component=actor_component,
        input_spaces=dict(
            states=state_space,
            other_nn_inputs=Tuple(internal_states_space, add_batch_rank=True),
            time_percentage=time_percentages_space
        ),
        action_space=action_space
    )

    # Some state inputs (batch-size=2, seq-len=1000; batch-major).
    np.random.seed(10)
    states = state_space.sample(size=(1000, 2))
    initial_internal_states = internal_states_space.zeros(size=2)  # only batch
    time_percentages = time_percentages_space.sample(1000)

    # Run a single time-step n times to simulate acting and env interaction with an LSTM.
    preprocessed_states = np.ndarray(shape=(1000, 2, 2), dtype=np.float32)
    actions = np.ndarray(shape=(1000, 2, 1), dtype=np.int32)
    for i, time_percentage in enumerate(time_percentages):
        ret = test.test((
            "get_preprocessed_state_and_action",
            # Expand time dim at the 1st slot as we are time-major == False.
            [np.expand_dims(states[i], 1), tuple([initial_internal_states]), time_percentage]
        ))
        preprocessed_states[i] = ret["preprocessed_state"][:, 0, :]  # take out the time-rank again
        actions[i] = ret["action"]
        # Check c/h-state shapes.
        self.assertEqual(ret["nn_outputs"][1][0].shape, (2, 3))  # batch-size=2, LSTM units=3
        self.assertEqual(ret["nn_outputs"][1][1].shape, (2, 3))

    # Check all preprocessed states (easy: just divided by 10).
    expected_preprocessed_state = states / 10
    recursive_assert_almost_equal(preprocessed_states, expected_preprocessed_state)

    # Check the exploration functionality over the actions.
    # Not checking the mean as we are mostly in the non-exploratory region; that's why the stddev should be small.
    stddev_actions = actions.std()
    self.assertGreater(stddev_actions, 0.4)
    self.assertLess(stddev_actions, 0.6)
def test_exploration_with_continuous_action_space(self):
    # TODO: not portable, redo with more general mean/stddev checks over a sample of distributed outputs.
    return

    # A 2x2 continuous action-pick.
    action_space = FloatBox(shape=(2, 2), add_batch_rank=True)

    # Our distribution to go into the Exploration object.
    distribution = Normal()
    action_adapter = ActionAdapter(action_space=action_space)
    nn_output_space = FloatBox(shape=(13,), add_batch_rank=True)  # 13: Any flat nn-output should be ok.

    exploration = Exploration.from_spec(dict(noise_spec=dict(type="gaussian_noise", mean=10.0, stddev=2.0)))

    # The Component to test.
    exploration_pipeline = Component(scope="continuous-plus-noise")
    exploration_pipeline.add_components(action_adapter, distribution, exploration, scope="exploration-pipeline")

    @rlgraph_api(component=exploration_pipeline)
    def get_action(self_, nn_output):
        _, parameters, _ = action_adapter.get_logits_probabilities_log_probs(nn_output)
        sample_stochastic = distribution.sample_stochastic(parameters)
        sample_deterministic = distribution.sample_deterministic(parameters)
        action = exploration.get_action(sample_stochastic, sample_deterministic)
        return action

    @rlgraph_api(component=exploration_pipeline)
    def get_noise(self_):
        return exploration.noise_component.get_noise()

    test = ComponentTest(
        component=exploration_pipeline, input_spaces=dict(nn_output=nn_output_space), action_space=action_space
    )

    # Collect outputs in the `collected` list to compare moments.
    collected = list()
    for _ in range_(1000):
        test.test("get_noise", fn_test=lambda component_test, outs: collected.append(outs))

    self.assertAlmostEqual(10.0, np.mean(collected), places=1)
    self.assertAlmostEqual(2.0, np.std(collected), places=1)

    np.random.seed(10)
    input_ = nn_output_space.sample(size=3)
    expected = np.array([[[13.163095, 8.46925], [10.375976, 5.4675055]],
                         [[13.239931, 7.990649], [10.03761, 10.465796]],
                         [[10.280741, 7.2384844], [10.040194, 8.248206]]], dtype=np.float32)
    test.test(("get_action", input_), expected_outputs=expected, decimals=3)
def test_simple_actor_component(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(5,), add_batch_rank=True)
    # action_space.
    action_space = IntBox(10)

    preprocessor = PreprocessorStack.from_spec(
        [dict(type="convert_type", to_dtype="float"), dict(type="multiply", factor=2)]
    )
    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    exploration = Exploration()  # no exploration

    actor_component = ActorComponent(preprocessor, policy, exploration)

    test = ComponentTest(
        component=actor_component, input_spaces=dict(states=state_space), action_space=action_space
    )

    # Get and check some actions.
    actor_component_params = test.read_variable_values(actor_component.variables)

    # Some state inputs (5 input nodes, batch size=2).
    states = state_space.sample(2)
    # Expected NN-output.
    expected_nn_output = np.matmul(
        states * 2,
        actor_component_params["actor-component/policy/test-network/hidden-layer/dense/kernel"]
    )
    # Raw action-layer output.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        actor_component_params["actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
    )
    # Final actions (max-likelihood/greedy pick).
    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    expected_preprocessed_state = states * 2
    test.test(
        ("get_preprocessed_state_and_action", states),
        expected_outputs=dict(preprocessed_state=expected_preprocessed_state, action=expected_actions)
    )

    # Get actions and action-probs by calling a different API-method.
    states = state_space.sample(5)
    # Get and check some actions.
    actor_component_params = test.read_variable_values(actor_component.variables)
    # Expected NN-output.
    expected_nn_output = np.matmul(
        states * 2,
        actor_component_params["actor-component/policy/test-network/hidden-layer/dense/kernel"]
    )
    # Raw action-layer output.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        actor_component_params["actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
    )
    # No reshape necessary (simple action space), softmax to get probs.
    expected_action_probs = softmax(expected_action_layer_output)
    # Final actions (max-likelihood/greedy pick).
    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    expected_preprocessed_state = states * 2
    test.test(
        ("get_preprocessed_state_action_and_action_probs", states),
        expected_outputs=dict(
            preprocessed_state=expected_preprocessed_state,
            action=expected_actions,
            action_probs=expected_action_probs
        )
    )
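
# Minimal softmax sketch (illustration only, not from the original source): the test above computes
# `expected_action_probs` with a `softmax` helper; the assumption here is that it behaves like the
# standard, numerically stable row-wise softmax below. Assumes numpy is imported as np, as in the tests.
def _softmax_sketch(x, axis=-1):
    z = x - np.max(x, axis=axis, keepdims=True)  # shift for numerical stability
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)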
def test_exploration_with_discrete_action_space(self):
    nn_output_space = FloatBox(shape=(13,), add_batch_rank=True)
    time_step_space = IntBox(10000)

    # 2x2 action-pick, each composite action with 5 categories.
    action_space = IntBox(5, shape=(2, 2), add_batch_rank=True)

    # Our distribution to go into the Exploration object.
    distribution = Categorical()
    action_adapter = ActionAdapter(action_space=action_space)
    exploration = Exploration.from_spec(dict(epsilon_spec=dict(decay_spec=dict(
        type="linear_decay", from_=1.0, to_=0.0, start_timestep=0, num_timesteps=10000
    ))))

    # The Component to test.
    exploration_pipeline = Component(action_adapter, distribution, exploration, scope="exploration-pipeline")

    @rlgraph_api(component=exploration_pipeline)
    def get_action(self_, nn_output, time_step):
        out = action_adapter.get_logits_probabilities_log_probs(nn_output)
        sample = distribution.sample_deterministic(out["probabilities"])
        action = exploration.get_action(sample, time_step)
        return action

    test = ComponentTest(
        component=exploration_pipeline,
        input_spaces=dict(nn_output=nn_output_space, time_step=int),
        action_space=action_space
    )

    # With exploration: Check whether actions are equally distributed.
    nn_outputs = nn_output_space.sample(2)
    time_steps = time_step_space.sample(30)
    # Collect an action-batch-of-2 for each of our various random time steps.
    # Each action is an int box of shape=(2,2).
    actions = np.ndarray(shape=(30, 2, 2, 2), dtype=np.int32)
    for i, time_step in enumerate(time_steps):
        actions[i] = test.test(("get_action", [nn_outputs, time_step]), expected_outputs=None)

    # Assert some distribution of the actions.
    mean_action = actions.mean()
    stddev_action = actions.std()
    self.assertAlmostEqual(mean_action, 2.0, places=0)
    self.assertAlmostEqual(stddev_action, 1.0, places=0)

    # Without exploration (epsilon is force-set to 0.0): Check whether actions are always the same
    # (given the same nn_output all the time).
    nn_outputs = nn_output_space.sample(2)
    time_steps = time_step_space.sample(30) + 10000
    # Collect an action-batch-of-2 for each of our various random time steps.
    # Each action is an int box of shape=(2,2).
    actions = np.ndarray(shape=(30, 2, 2, 2), dtype=np.int32)
    for i, time_step in enumerate(time_steps):
        actions[i] = test.test(("get_action", [nn_outputs, time_step]), expected_outputs=None)

    # Assert zero stddev of the single action components.
    stddev_action_a = actions[:, 0, 0, 0].std()  # batch item 0, action-component (0,0)
    self.assertAlmostEqual(stddev_action_a, 0.0, places=1)
    stddev_action_b = actions[:, 1, 1, 0].std()  # batch item 1, action-component (1,0)
    self.assertAlmostEqual(stddev_action_b, 0.0, places=1)
    stddev_action_c = actions[:, 0, 0, 1].std()  # batch item 0, action-component (0,1)
    self.assertAlmostEqual(stddev_action_c, 0.0, places=1)
    stddev_action_d = actions[:, 1, 1, 1].std()  # batch item 1, action-component (1,1)
    self.assertAlmostEqual(stddev_action_d, 0.0, places=1)

    self.assertAlmostEqual(actions.std(), 1.0, places=0)
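
# Hedged sketch (an assumption for illustration, not rlgraph's actual decay implementation) of what the
# "linear_decay" spec used above is expected to do: epsilon falls linearly from `from_` to `to_` between
# `start_timestep` and `start_timestep + num_timesteps`, then stays at `to_`. This is why sampling
# time steps >= 10000 in the test above effectively disables exploration.
def _linear_decay_sketch(time_step, from_=1.0, to_=0.0, start_timestep=0, num_timesteps=10000):
    progress = min(max(time_step - start_timestep, 0), num_timesteps) / num_timesteps
    return from_ + (to_ - from_) * progress

# _linear_decay_sketch(0) == 1.0; _linear_decay_sketch(5000) == 0.5; _linear_decay_sketch(12000) == 0.0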
def test_exploration_with_discrete_container_action_space(self):
    nn_output_space = FloatBox(shape=(12,), add_batch_rank=True)
    time_step_space = IntBox(10000)

    # Some container action space.
    action_space = Dict(dict(a=IntBox(3), b=IntBox(2), c=IntBox(4)), add_batch_rank=True)

    # Our distributions to go into the Exploration object.
    distribution_a = Categorical(scope="d_a")
    distribution_b = Categorical(scope="d_b")
    distribution_c = Categorical(scope="d_c")
    action_adapter_a = ActionAdapter(action_space=action_space["a"], scope="aa_a")
    action_adapter_b = ActionAdapter(action_space=action_space["b"], scope="aa_b")
    action_adapter_c = ActionAdapter(action_space=action_space["c"], scope="aa_c")
    exploration = Exploration.from_spec(dict(epsilon_spec=dict(decay_spec=dict(
        type="linear_decay", from_=1.0, to_=0.0, start_timestep=0, num_timesteps=10000
    ))))

    # The Component to test.
    exploration_pipeline = Component(
        action_adapter_a, action_adapter_b, action_adapter_c,
        distribution_a, distribution_b, distribution_c,
        exploration, scope="exploration-pipeline"
    )

    @rlgraph_api(component=exploration_pipeline)
    def get_action(self_, nn_output, time_step):
        out_a = action_adapter_a.get_logits_probabilities_log_probs(nn_output)
        out_b = action_adapter_b.get_logits_probabilities_log_probs(nn_output)
        out_c = action_adapter_c.get_logits_probabilities_log_probs(nn_output)
        sample_a = distribution_a.sample_deterministic(out_a["probabilities"])
        sample_b = distribution_b.sample_deterministic(out_b["probabilities"])
        sample_c = distribution_c.sample_deterministic(out_c["probabilities"])
        sample = self_._graph_fn_merge_actions(sample_a, sample_b, sample_c)
        action = exploration.get_action(sample, time_step)
        return action

    @graph_fn(component=exploration_pipeline)
    def _graph_fn_merge_actions(self, a, b, c):
        return DataOpDict(a=a, b=b, c=c)

    test = ComponentTest(
        component=exploration_pipeline,
        input_spaces=dict(nn_output=nn_output_space, time_step=int),
        action_space=action_space
    )

    # With exploration: Check whether actions are equally distributed.
    batch_size = 2
    num_time_steps = 30
    nn_outputs = nn_output_space.sample(batch_size)
    time_steps = time_step_space.sample(num_time_steps)

    # Collect an action-batch-of-2 for each of our various random time steps.
    actions_a = np.ndarray(shape=(num_time_steps, batch_size), dtype=np.int32)
    actions_b = np.ndarray(shape=(num_time_steps, batch_size), dtype=np.int32)
    actions_c = np.ndarray(shape=(num_time_steps, batch_size), dtype=np.int32)
    for i, t in enumerate(time_steps):
        a = test.test(("get_action", [nn_outputs, t]), expected_outputs=None)
        actions_a[i] = a["a"]
        actions_b[i] = a["b"]
        actions_c[i] = a["c"]

    # Assert some distribution of the actions.
    mean_action_a = actions_a.mean()
    stddev_action_a = actions_a.std()
    self.assertAlmostEqual(mean_action_a, 1.0, places=0)
    self.assertAlmostEqual(stddev_action_a, 1.0, places=0)
    mean_action_b = actions_b.mean()
    stddev_action_b = actions_b.std()
    self.assertAlmostEqual(mean_action_b, 0.5, places=0)
    self.assertAlmostEqual(stddev_action_b, 0.5, places=0)
    mean_action_c = actions_c.mean()
    stddev_action_c = actions_c.std()
    self.assertAlmostEqual(mean_action_c, 1.5, places=0)
    self.assertAlmostEqual(stddev_action_c, 1.0, places=0)

    # Without exploration (epsilon is force-set to 0.0): Check whether actions are always the same
    # (given the same nn_output all the time).
    nn_outputs = nn_output_space.sample(batch_size)
    time_steps = time_step_space.sample(num_time_steps) + 10000

    # Collect an action-batch-of-2 for each of our various random time steps.
    actions_a = np.ndarray(shape=(num_time_steps, batch_size), dtype=np.int32)
    actions_b = np.ndarray(shape=(num_time_steps, batch_size), dtype=np.int32)
    actions_c = np.ndarray(shape=(num_time_steps, batch_size), dtype=np.int32)
    for i, t in enumerate(time_steps):
        a = test.test(("get_action", [nn_outputs, t]), expected_outputs=None)
        actions_a[i] = a["a"]
        actions_b[i] = a["b"]
        actions_c[i] = a["c"]

    # Assert zero stddev of the single action components.
    stddev_action = actions_a[:, 0].std()  # batch item 0, action-component a
    self.assertAlmostEqual(stddev_action, 0.0, places=1)
    stddev_action = actions_a[:, 1].std()  # batch item 1, action-component a
    self.assertAlmostEqual(stddev_action, 0.0, places=1)
    stddev_action = actions_b[:, 0].std()  # batch item 0, action-component b
    self.assertAlmostEqual(stddev_action, 0.0, places=1)
    stddev_action = actions_b[:, 1].std()  # batch item 1, action-component b
    self.assertAlmostEqual(stddev_action, 0.0, places=1)
    stddev_action = actions_c[:, 0].std()  # batch item 0, action-component c
    self.assertAlmostEqual(stddev_action, 0.0, places=1)
    stddev_action = actions_c[:, 1].std()  # batch item 1, action-component c
    self.assertAlmostEqual(stddev_action, 0.0, places=1)
def test_environment_stepper_on_deepmind_lab(self):
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("DeepmindLab not installed: Skipping this test case.")
        return

    env_spec = dict(
        type="deepmind_lab", level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED"], frameskip=4
    )
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space
    actor_component = ActorComponent(
        # Preprocessor spec (only divide and flatten the image).
        [
            {"type": "divide", "divisor": 255},
            {"type": "reshape", "flatten": True}
        ],
        # Policy spec.
        dict(network_spec="../configs/test_lstm_nn.json", action_space=action_space),
        # Exploration spec.
        Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
        )))
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space_test_lstm,
        num_steps=1000,
        # Add both prev-action and -reward into the state sent through the network.
        #add_previous_action_to_state=True,
        #add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=FloatBox(shape=(9,), add_batch_rank=True)
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
    )

    # Reset the stepper.
    test.test("reset")

    # Step n times through the Env and collect results.
    # 1st return value is the step-op (None), 2nd return value is the tuple of items (3 steps each), with each
    # step containing: Preprocessed state, actions, rewards, episode returns, terminals, (raw) next-states.
    time_start = time.monotonic()
    steps = 10
    out = None
    for _ in range(steps):
        out = test.test("step")
    time_total = time.monotonic() - time_start
    print(
        "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)".format(
            steps, environment_stepper.num_steps, time_total, environment_stepper.num_steps * steps / time_total
        )
    )

    # Check types of outputs.
    self.assertTrue(out[0] is None)
    self.assertTrue(isinstance(out[1], DataOpTuple))  # the step results as a tuple (see below)

    # Check types of single data.
    #self.assertTrue(out[0].dtype == np.float32)
    #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
    #self.assertTrue(out[0].max() <= 1.0)
    #self.assertTrue(out[1].dtype == np.int32)  # actions
    #self.assertTrue(out[2].dtype == np.float32)  # rewards
    #self.assertTrue(out[0].dtype == np.float32)  # episode return
    self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
    self.assertTrue(out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
    self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
    self.assertTrue(out[1][1].max() <= 255)

    # Action probs (test whether they sum to one).
    #self.assertTrue(out[1][6].dtype == np.float32)
    #self.assertTrue(out[1][6].min() >= 0.0)
    #self.assertTrue(out[1][6].max() <= 1.0)
    #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
    #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)

    # Internal states (c- and h-state).
    self.assertTrue(out[3][0].dtype == np.float32)
    self.assertTrue(out[3][1].dtype == np.float32)
    self.assertTrue(out[3][0].shape == (environment_stepper.num_steps, 3))
    self.assertTrue(out[3][1].shape == (environment_stepper.num_steps, 3))

    # Check whether episode returns match single rewards (including terminal signals).
    #episode_returns = 0.0
    #for i in range(environment_stepper.num_steps):
    #    episode_returns += out[0][i]
    #    self.assertAlmostEqual(episode_returns, out[3][i])
    #    # Terminal: Reset for next step.
    #    if out[4][i] is np.bool_(True):
    #        episode_returns = 0.0

    test.terminate()