def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
    """
    Runs one "step" call of an EnvironmentStepper that uses an LSTM policy on a deterministic env.

    The expected outputs (terminal flags, raw states, action probs and LSTM internal states) are
    recomputed in numpy from the policy's current variable values.
    """
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
    preprocessor_spec = [dict(type="multiply", factor=0.1)]
    network_spec = config_from_path("configs/test_lstm_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        internal_states_space=internal_states_space,
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=4,
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    # From here on, always terminate the test, even if reading the variables or the "step" check fails:
    # terminating closes the session (which shuts down the Env on the server).
    try:
        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 4 times (num_steps=4) through the Env and collect results.
        # Each LSTM call (after the first) receives the internal states returned by the previous call.
        # Inputs are the raw states (0, 1, 2, 0) after the `multiply` (factor=0.1) preprocessor.
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
        lstm_outs = (lstm_1, lstm_2, lstm_3, lstm_4)

        expected = (
            # Terminal flags.
            np.array([False, False, True, False]),
            # s' (raw)
            np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),
            # action probs
            np.array([
                softmax(dense_layer(np.squeeze(out[0]), weights_action, biases_action))
                for out in lstm_outs
            ]),
            # internal states (initial all-zero state followed by the state after each step).
            (
                np.squeeze(np.array([[[0.0, 0.0, 0.0]]] + [out[1][0] for out in lstm_outs])),
                np.squeeze(np.array([[[0.0, 0.0, 0.0]]] + [out[1][1] for out in lstm_outs]))
            )
        )
        test.test("step", expected_outputs=expected)
    finally:
        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
def test_simple_action_adapter(self):
    """
    Checks an ActionAdapter's raw action-layer output and its logits/probabilities/log-probs.

    Expected values are recomputed in numpy from the adapter's dense kernel (the adapter is built
    with `biases_spec=False`, so no bias term is added here).
    """
    # Last NN layer.
    last_nn_layer_space = FloatBox(shape=(16, ), add_batch_rank=True)
    # Action Space.
    action_space = IntBox(2, shape=(3, 2))

    action_adapter = ActionAdapter(
        action_space=action_space, weights_spec=1.0, biases_spec=False, activation="relu"
    )
    test = ComponentTest(
        component=action_adapter,
        input_spaces=dict(nn_output=last_nn_layer_space),
        action_space=action_space
    )
    action_adapter_params = test.read_variable_values(action_adapter.variables)

    # Batch of 2 samples.
    inputs = last_nn_layer_space.sample(2)

    kernel = action_adapter_params["action-adapter/action-layer/dense/kernel"]
    expected_action_layer_output = np.matmul(inputs, kernel)
    test.test(
        ("get_action_layer_output", inputs),
        expected_outputs=dict(output=expected_action_layer_output)
    )

    # Unflatten to (2, 3, 2, 2): batch of 2, then presumably action-space shape (3, 2) x 2 categories.
    # The shape is passed positionally: the `newshape` keyword of np.reshape is deprecated in newer NumPy.
    expected_logits = np.reshape(expected_action_layer_output, (2, 3, 2, 2))
    expected_probabilities = softmax(expected_logits)
    expected_log_probs = np.log(expected_probabilities)
    test.test(
        ("get_logits_probabilities_log_probs", inputs),
        expected_outputs=dict(
            logits=expected_logits,
            probabilities=expected_probabilities,
            log_probs=expected_log_probs
        )
    )
def test_action_adapter_with_complex_lstm_output(self):
    """
    Checks an ActionAdapter fed with time-major (time x batch) inputs, as produced by an LSTM layer.

    The numpy reference folds the time rank into the batch rank, applies the dense kernel and unfolds
    again; the results are compared against the "apply", "get_logits" and
    "get_logits_probabilities_log_probs" API outputs.
    """
    # Last NN layer (LSTM with time rank).
    last_nn_layer_space = FloatBox(shape=(4,), add_batch_rank=True, add_time_rank=True, time_major=True)
    # Action Space.
    action_space = IntBox(2, shape=(3, 2))

    action_adapter = ActionAdapter(action_space=action_space, biases_spec=False)
    test = ComponentTest(
        component=action_adapter,
        input_spaces=dict(
            nn_output=last_nn_layer_space,
            inputs=[last_nn_layer_space]
        ),
        action_space=action_space
    )
    action_adapter_params = test.read_variable_values(action_adapter.variables)

    # Batch of 2 samples, 3 timesteps.
    inputs = last_nn_layer_space.sample(size=(3, 2))
    # Fold time rank before the action layer pass through (3 * 2 = 6 rows).
    # Shapes are passed positionally: the `newshape` keyword of np.reshape is deprecated in newer NumPy.
    inputs_reshaped = np.reshape(inputs, (6, -1))
    # Action layer pass through and unfolding of time rank.
    kernel = action_adapter_params["action-adapter/action-network/action-layer/dense/kernel"]
    expected_action_layer_output = np.matmul(inputs_reshaped, kernel).reshape((3, 2, -1))
    # Logits (already well reshaped (same as action space)).
    expected_logits = np.reshape(expected_action_layer_output, (3, 2, 3, 2, 2))

    test.test(("apply", inputs), expected_outputs=dict(output=expected_logits))
    test.test(("get_logits", inputs), expected_outputs=expected_logits)

    # Softmax (probs).
    expected_probabilities = softmax(expected_logits)
    # Log probs.
    expected_log_probs = np.log(expected_probabilities)
    test.test(
        ("get_logits_probabilities_log_probs", inputs),
        expected_outputs=dict(
            logits=expected_logits,
            probabilities=expected_probabilities,
            log_probs=expected_log_probs
        ),
        decimals=5
    )
def test_categorical_cross_entropy_loss_wo_time_rank(self):
    """
    Compares CategoricalCrossEntropyLoss against a manual binary cross-entropy (batch rank only).

    Only the no-time-rank case is covered. A time-rank variant (with sequence lengths) would have to
    mirror the exact time-rank reduction scheme used inside the loss function.
    """
    # Binary labels, batch rank only (no time rank).
    labels_space = IntBox(2, shape=(), add_batch_rank=True)
    parameters_space = labels_space.as_one_hot_float_space()
    loss_per_item_space = FloatBox(shape=(), add_batch_rank=True)

    loss_function = CategoricalCrossEntropyLoss()
    test = ComponentTest(
        component=loss_function,
        input_spaces=dict(
            labels=labels_space,
            loss_per_item=loss_per_item_space,
            parameters=parameters_space
        )
    )

    batch_size = 4
    parameters = parameters_space.sample(batch_size)
    # Probability assigned to class 1 for every batch item.
    prob_of_one = softmax(parameters)[:, 1]
    labels = labels_space.sample(batch_size)

    # Manual binary x-entropy: -[y*log(p) + (1-y)*log(1-p)]
    #   label 0 -> -log(1 - p)
    #   label 1 -> -log(p)
    # i.e. the negative log of the probability of the actual label.
    prob_of_label = np.where(labels == 0, 1.0 - prob_of_one, prob_of_one)
    expected_loss_per_item = -np.log(prob_of_label)
    expected_loss = np.mean(expected_loss_per_item, axis=0, keepdims=False)

    test.test(
        ("loss_per_item", [parameters, labels]),
        expected_outputs=expected_loss_per_item,
        decimals=4
    )
    test.test(
        ("loss_average", expected_loss_per_item),
        expected_outputs=expected_loss,
        decimals=4
    )
    # Both.
    test.test(
        ("loss", [parameters, labels]),
        expected_outputs=[expected_loss, expected_loss_per_item],
        decimals=4
    )
def test_dueling_action_adapter(self):
    """
    Checks a DuelingActionAdapter's advantage/state-value streams and the resulting q-values.

    The numpy reference applies the advantage stream without a non-linearity (the adapter is built with
    `activation_advantage_stream="linear"`) and the state-value stream with a relu, then combines them
    as q = V + A - mean(A).
    """
    # Last NN layer.
    last_nn_layer_space = FloatBox(shape=(7, ), add_batch_rank=True)
    # Action Space.
    action_space = IntBox(4, shape=(2, ))

    action_adapter = DuelingActionAdapter(
        action_space=action_space,
        units_state_value_stream=5,
        units_advantage_stream=4,
        weights_spec_state_value_stream=1.0,
        weights_spec_advantage_stream=0.5,
        activation_advantage_stream="linear",
        scope="aa"
    )
    test = ComponentTest(
        component=action_adapter,
        input_spaces=dict(nn_output=last_nn_layer_space),
        action_space=action_space
    )

    # Batch of 2 samples.
    batch_size = 2
    inputs = last_nn_layer_space.sample(size=batch_size)

    dueling_action_adapter_vars = test.read_variable_values(action_adapter.variables)

    # Expected action layer output are the advantage nodes.
    advantage_hidden = np.matmul(
        inputs, dueling_action_adapter_vars["aa/dense-layer-advantage-stream/dense/kernel"]
    )
    expected_raw_advantages = np.matmul(
        advantage_hidden, dueling_action_adapter_vars["aa/action-layer/dense/kernel"]
    )
    state_value_hidden = relu(np.matmul(
        inputs, dueling_action_adapter_vars["aa/dense-layer-state-value-stream/dense/kernel"]
    ))
    expected_state_values = np.matmul(
        state_value_hidden, dueling_action_adapter_vars["aa/state-value-node/dense/kernel"]
    )
    test.test(
        ("get_action_layer_output", inputs),
        expected_outputs=dict(state_value_node=expected_state_values, output=expected_raw_advantages),
        decimals=5
    )

    # Shape passed positionally: the `newshape` keyword of np.reshape is deprecated in newer NumPy.
    expected_advantages = np.reshape(expected_raw_advantages, (batch_size, 2, 4))

    # Expected q-values/logits, probabilities (softmaxed q) and log(p).
    expanded_state_values = np.expand_dims(expected_state_values, axis=1)
    expected_q_values = expanded_state_values + expected_advantages - \
        np.mean(expected_advantages, axis=-1, keepdims=True)
    expected_probs = softmax(expected_q_values)

    test.test(
        ("get_logits_probabilities_log_probs", inputs),
        expected_outputs=dict(
            state_values=expected_state_values,
            logits=expected_q_values,
            probabilities=expected_probs,
            log_probs=np.log(expected_probs)
        ),
        decimals=3
    )
def test_neg_log_likelihood_loss_function_w_container_space(self):
    """
    Checks NegativeLogLikelihoodLoss on Dict (container) parameter/label spaces.

    The expected per-item loss is the negated sum of a normal log-density (key "a", via scipy) and a
    categorical log-probability (key "b", via softmax), computed for a batch of 2.
    """
    batch_size = 2

    parameters_space = Dict(
        {
            # Make sure stddev params are not too crazy (just like our adapters do clipping for the raw
            # NN output).
            "a": Tuple(FloatBox(shape=(2, 3)), FloatBox(0.5, 1.0, shape=(2, 3))),  # normal (0.0 to 1.0)
            "b": FloatBox(shape=(4, ), low=-1.0, high=1.0)  # 4-discrete
        },
        add_batch_rank=True
    )
    labels_space = Dict(
        {
            "a": FloatBox(shape=(2, 3)),
            "b": IntBox(4)
        },
        add_batch_rank=True
    )
    loss_per_item_space = FloatBox(add_batch_rank=True)

    distribution_spec = get_default_distribution_from_space(labels_space)
    loss_function = NegativeLogLikelihoodLoss(distribution_spec=distribution_spec)

    test = ComponentTest(
        component=loss_function,
        input_spaces=dict(
            parameters=parameters_space,
            labels=labels_space,
            loss_per_item=loss_per_item_space
        )
    )

    parameters = parameters_space.sample(batch_size)
    # Softmax the discrete params.
    probs_b = softmax(parameters["b"])
    labels = labels_space.sample(batch_size)

    # Expected loss: Sum of all -log(llh)
    # "a": log-density under norm(parameters["a"][0], parameters["a"][1]), summed over the last two axes.
    densities_a = sts.norm.pdf(labels["a"], parameters["a"][0], parameters["a"][1])
    log_prob_per_item_a = np.sum(np.log(densities_a), axis=(-1, -2))
    # "b": log-probability of the labelled category, per batch item.
    log_prob_per_item_b = np.array([
        np.log(probs_b[item][labels["b"][item]]) for item in range(batch_size)
    ])

    expected_loss_per_item = -(log_prob_per_item_a + log_prob_per_item_b)
    expected_loss = np.mean(expected_loss_per_item, axis=0, keepdims=False)

    test.test(
        ("loss_per_item", [parameters, labels]),
        expected_outputs=expected_loss_per_item,
        decimals=4
    )
    test.test(
        ("loss_average", expected_loss_per_item),
        expected_outputs=expected_loss,
        decimals=4
    )
    # Both.
    test.test(
        ("loss", [parameters, labels]),
        expected_outputs=[expected_loss, expected_loss_per_item],
        decimals=4
    )
def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
    """
    Calls "step" twice on an EnvironmentStepper (simple dense policy) that also returns action probs.

    The second call checks that the stepper continues from where the first call stopped (the first
    expected state of call two equals the last state of call one) and that the terminal/reset at the
    end of the 6-step episode is reported.
    """
    preprocessor_spec = [dict(type="divide", divisor=2)]
    network_spec = config_from_path("configs/test_simple_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=3
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    # From here on, always terminate the test, even if reading the variables or a "step" check fails:
    # terminating closes the session (which shuts down the Env on the server).
    try:
        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_hid = weights[policy_scope+"test-network/hidden-layer/dense/kernel"]
        biases_hid = weights[policy_scope+"test-network/hidden-layer/dense/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        def expected_action_probs(preprocessed_states):
            # Numpy forward pass (hidden layer -> action layer -> softmax), one entry per time step.
            return np.array([
                softmax(dense_layer(
                    dense_layer(np.array([state]), weights_hid, biases_hid),
                    weights_action, biases_action
                ))
                for state in preprocessed_states
            ])

        # Step 3 times through the Env and collect results.
        expected = (
            # t_
            np.array([False, False, False]),
            # s' (raw)
            np.array([[0.0], [1.0], [2.0], [3.0]]),
            # action probs (network inputs are the raw states divided by 2 by the preprocessor).
            expected_action_probs([0.0, 0.5, 1.0])
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again, check whether stitching of states/etc.. works.
        expected = (
            # t_
            np.array([False, False, True]),
            # s' (raw)
            np.array([[3.0], [4.0], [5.0], [0.0]]),
            # action probs
            expected_action_probs([1.5, 2.0, 2.5])
        )
        test.test("step", expected_outputs=expected, decimals=3)
    finally:
        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
def test_v_trace_function_more_complex(self):
    """
    Compares the default-backend VTraceFunction against a `backend="python"` reference instance.

    Random inputs of size (100, 16) are fed to both; the reference's
    `_graph_fn_calc_v_trace_values` result is used as the expected output of "calc_v_trace_values".
    """
    v_trace_function = VTraceFunction()
    v_trace_function_reference = VTraceFunction(backend="python")

    space_x9 = self.time_x_batch_x_9_space
    space_x1 = self.time_x_batch_x_1_space
    action_space = IntBox(9, add_batch_rank=True, add_time_rank=True, time_major=True)
    action_space_flat = FloatBox(shape=(9, ), add_batch_rank=True, add_time_rank=True, time_major=True)

    input_spaces = dict(
        logits_actions_pi=space_x9,
        log_probs_actions_mu=space_x9,
        actions=action_space,
        actions_flat=action_space_flat,
        discounts=space_x1,
        rewards=space_x1,
        values=space_x1,
        bootstrapped_values=space_x1
    )
    test = ComponentTest(component=v_trace_function, input_spaces=input_spaces)

    size = (100, 16)
    logits_actions_pi = space_x9.sample(size=size)
    logits_actions_mu = space_x9.sample(size=size)
    log_probs_actions_mu = np.log(softmax(logits_actions_mu))
    actions = action_space.sample(size=size)
    actions_flat = one_hot(actions, depth=action_space.num_categories)
    # Set some discounts to 0.0 (these will mark the end of episodes, where the value is 0.0).
    discounts = np.random.choice([0.0, 0.99], size=size + (1, ), p=[0.1, 0.9])
    rewards = space_x1.sample(size=size)
    values = space_x1.sample(size=size)
    # Single (bootstrapped) value slice: size 1 in the first rank, same second rank as the other inputs.
    bootstrapped_values = space_x1.sample(size=(1, size[1]))

    input_ = [
        logits_actions_pi,
        log_probs_actions_mu,
        actions,
        actions_flat,
        discounts,
        rewards,
        values,
        bootstrapped_values
    ]

    reference_outputs = v_trace_function_reference._graph_fn_calc_v_trace_values(*input_)
    vs_expected, pg_advantages_expected = reference_outputs

    test.test(
        ("calc_v_trace_values", input_),
        expected_outputs=[vs_expected, pg_advantages_expected],
        decimals=4
    )
def test_categorical(self):
    """
    Checks the Categorical distribution component: deterministic draws, stochastic draws, log-probs.

    - Deterministic draws must equal the argmax over the last axis of the parameters.
    - The mean over all stochastic draws is compared against 1.0 (decimals=1).
    - Log-probs are compared against log(softmax(parameters)) picked at the label indices.
    """
    # Create 5 categorical distributions of 3 categories each.
    num_distributions = 5
    param_space = FloatBox(shape=(5, 3), low=-1.0, high=2.0, add_batch_rank=True)
    values_space = IntBox(3, shape=(5, ), add_batch_rank=True)

    # The Component to test.
    categorical = Categorical(switched_off_apis={"kl_divergence"})

    input_spaces = dict(
        parameters=param_space,
        values=values_space,
        deterministic=bool,
    )
    test = ComponentTest(component=categorical, input_spaces=input_spaces)

    # Batch of size=3 and deterministic (True).
    deterministic_params = input_spaces["parameters"].sample(3)
    input_ = [deterministic_params, True]
    expected = np.argmax(deterministic_params, axis=-1)
    # Sample n times, expect always max value (max likelihood for deterministic draw).
    for _ in range(10):
        test.test(("draw", input_), expected_outputs=expected)
        test.test(("sample_deterministic", deterministic_params), expected_outputs=expected)

    # Batch of size=3 and non-deterministic -> expect roughly the mean.
    stochastic_params = input_spaces["parameters"].sample(3)
    input_ = [stochastic_params, False]
    outs = []
    for _ in range(20):
        outs.append(test.test(("draw", input_)))
        outs.append(test.test(("sample_stochastic", stochastic_params)))

    recursive_assert_almost_equal(np.mean(outs), 1.0, decimals=1)

    # Test log-likelihood outputs.
    input_ = param_space.sample(1)
    labels = values_space.sample(1)
    probs = softmax(input_)

    # Probability of the labelled category for each of the 5 distributions (single batch item).
    label_probs = np.array([
        [probs[0][dist][labels[0][dist]] for dist in range(num_distributions)]
    ])
    test.test(("log_prob", [input_, labels]), expected_outputs=np.log(label_probs), decimals=4)
def test_simple_action_adapter_with_batch_apply(self):
    """
    Checks a CategoricalDistributionAdapter that folds and unfolds the time rank around its action layer.

    The numpy reference folds the (4, 5) leading ranks into 20 rows, applies the dense kernel (the
    adapter is built with `biases_spec=False`) and reshapes back to (4, 5, 3, 2, 2).
    """
    # Last NN layer.
    previous_nn_layer_space = FloatBox(
        shape=(16, ), add_batch_rank=True, add_time_rank=True, time_major=True
    )
    logits_space = FloatBox(shape=(3, 2, 2), add_batch_rank=True)
    # Action Space.
    action_space = IntBox(2, shape=(3, 2))

    action_adapter = CategoricalDistributionAdapter(
        action_space=action_space,
        weights_spec=1.0,
        biases_spec=False,
        fold_time_rank=True,
        unfold_time_rank=True,
        activation="relu"
    )
    test = ComponentTest(
        component=action_adapter,
        input_spaces=dict(nn_input=previous_nn_layer_space, logits=logits_space),
        action_space=action_space
    )
    action_adapter_params = test.read_variable_values(action_adapter.variable_registry)

    # Batch of (4, 5).
    inputs = previous_nn_layer_space.sample(size=(4, 5))
    # Shapes are passed positionally: the `newshape` keyword of np.reshape is deprecated in newer NumPy.
    inputs_folded = np.reshape(inputs, (20, -1))

    kernel = action_adapter_params["action-adapter/action-network/action-layer/dense/kernel"]
    expected_action_layer_output = np.matmul(inputs_folded, kernel)
    expected_logits = np.reshape(expected_action_layer_output, (4, 5, 3, 2, 2))

    test.test(("apply", inputs), expected_outputs=dict(output=expected_logits), decimals=4)
    test.test(("get_logits", inputs), expected_outputs=expected_logits, decimals=4)

    expected_parameters = softmax(expected_logits)
    expected_log_probs = np.log(expected_parameters)
    test.test(
        ("get_logits_parameters_log_probs", inputs),
        expected_outputs=dict(
            logits=expected_logits,
            parameters=expected_parameters,
            log_probs=expected_log_probs
        ),
        decimals=4
    )
def test_simple_action_adapter(self):
    """
    Checks a CategoricalDistributionAdapter's "call" and "get_parameters" outputs (batch rank only).

    Expected logits are the inputs multiplied by the adapter's dense kernel (built with
    `biases_spec=False`), reshaped to (2, 3, 2, 2); probabilities and log-probs are derived from them.
    """
    # Last NN layer.
    previous_nn_layer_space = FloatBox(shape=(16, ), add_batch_rank=True)
    adapter_outputs_space = FloatBox(shape=(3, 2, 2), add_batch_rank=True)
    # Action Space.
    action_space = IntBox(2, shape=(3, 2))

    action_adapter = CategoricalDistributionAdapter(
        action_space=action_space, weights_spec=1.0, biases_spec=False, activation="relu"
    )
    test = ComponentTest(
        component=action_adapter,
        input_spaces=dict(
            inputs=previous_nn_layer_space,
            adapter_outputs=adapter_outputs_space,
        ),
        action_space=action_space
    )
    action_adapter_params = test.read_variable_values(action_adapter.variable_registry)

    # Batch of 2 samples.
    inputs = previous_nn_layer_space.sample(2)

    kernel = action_adapter_params["action-adapter/action-network/action-layer/dense/kernel"]
    expected_action_layer_output = np.matmul(inputs, kernel)
    # Shape passed positionally: the `newshape` keyword of np.reshape is deprecated in newer NumPy.
    expected_logits = np.reshape(expected_action_layer_output, (2, 3, 2, 2))

    test.test(("call", inputs), expected_outputs=expected_logits, decimals=5)

    expected_probs = softmax(expected_logits)
    expected_log_probs = np.log(expected_probs)
    # Note: both `adapter_outputs` and `parameters` are expected to equal the logits here.
    test.test(
        ("get_parameters", inputs),
        expected_outputs=dict(
            adapter_outputs=expected_logits,
            parameters=expected_logits,
            probabilities=expected_probs,
            log_probs=expected_log_probs
        ),
        decimals=5
    )
def test_simple_actor_component(self):
    """
    Checks an ActorComponent (preprocessor + policy + exploration) through two API methods.

    The numpy reference doubles the states (mirroring the `multiply` factor=2 preprocessor), applies
    the hidden-layer kernel and the action-layer kernel, and takes the argmax as the action. The
    second call additionally checks the softmaxed action-layer output as action probs.
    """
    hidden_kernel_key = "actor-component/policy/test-network/hidden-layer/dense/kernel"
    action_kernel_key = \
        "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"

    def reference_forward_pass(variables, raw_states):
        # Returns (preprocessed states, raw action-layer output) computed in numpy.
        preprocessed = raw_states * 2
        # Expected NN-output.
        nn_output = np.matmul(raw_states * 2, variables[hidden_kernel_key])
        # Raw action layer output.
        return preprocessed, np.matmul(nn_output, variables[action_kernel_key])

    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(5, ), add_batch_rank=True)
    # action_space.
    action_space = IntBox(10)

    preprocessor = PreprocessorStack.from_spec([
        dict(type="convert_type", to_dtype="float"),
        dict(type="multiply", factor=2)
    ])
    policy = Policy(
        network_spec=config_from_path("configs/test_simple_nn.json"),
        action_space=action_space
    )
    exploration = Exploration()  # no exploration
    actor_component = ActorComponent(preprocessor, policy, exploration)

    test = ComponentTest(
        component=actor_component,
        input_spaces=dict(states=state_space),
        action_space=action_space
    )

    # Get and check some actions.
    actor_component_params = test.read_variable_values(actor_component.variables)
    # Some state inputs (5 input nodes, batch size=2).
    states = state_space.sample(2)
    expected_preprocessed_state, expected_action_layer_output = reference_forward_pass(
        actor_component_params, states
    )
    # Final actions (max-likelihood/greedy pick).
    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(
        ("get_preprocessed_state_and_action", states),
        expected_outputs=dict(
            preprocessed_state=expected_preprocessed_state,
            action=expected_actions
        )
    )

    # Get actions and action-probs by calling a different API-method.
    states = state_space.sample(5)
    # Re-read the variables before computing the second set of expected values.
    actor_component_params = test.read_variable_values(actor_component.variables)
    expected_preprocessed_state, expected_action_layer_output = reference_forward_pass(
        actor_component_params, states
    )
    # No reshape necessary (simple action space), softmax to get probs.
    expected_action_probs = softmax(expected_action_layer_output)
    # Final actions (max-likelihood/greedy pick).
    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(
        ("get_preprocessed_state_action_and_action_probs", states),
        expected_outputs=dict(
            preprocessed_state=expected_preprocessed_state,
            action=expected_actions,
            action_probs=expected_action_probs
        )
    )
def test_joint_cumulative_distribution(self):
    """
    Checks a JointCumulativeDistribution built from Categorical, MultivariateNormal, Beta and Normal.

    Three things are tested:
    - deterministic draws equal the per-distribution expected values,
    - the empirical mean of stochastic draws is close to the expected mean,
    - the joint log-prob equals the sum of the sub-distributions' log-probs (decimals=1).
    """
    param_space = Dict(
        {
            "a": FloatBox(shape=(4, )),  # 4-discrete
            "b": Dict({
                # 3-variate normal
                "ba": Tuple([FloatBox(shape=(3, )), FloatBox(0.1, 1.0, shape=(3, ))]),
                # beta -1 to 1
                "bb": Tuple([FloatBox(shape=(2, )), FloatBox(shape=(2, ))]),
                # normal (dim=4)
                "bc": Tuple([FloatBox(shape=(4, )), FloatBox(0.1, 1.0, shape=(4, ))]),
            })
        },
        add_batch_rank=True
    )
    values_space = Dict(
        {
            "a": IntBox(4),
            "b": Dict({
                "ba": FloatBox(shape=(3, )),
                "bb": FloatBox(shape=(2, )),
                "bc": FloatBox(shape=(4, ))
            })
        },
        add_batch_rank=True
    )
    input_spaces = dict(parameters=param_space, values=values_space, deterministic=bool)

    low, high = -1.0, 1.0
    joined_cumulative_distribution = JointCumulativeDistribution(
        distribution_specs={
            "/a": Categorical(),
            "/b/ba": MultivariateNormal(),
            "/b/bb": Beta(low=low, high=high),
            "/b/bc": Normal()
        },
        switched_off_apis={"kl_divergence"}
    )
    test = ComponentTest(component=joined_cumulative_distribution, input_spaces=input_spaces)

    def scaled_beta_mean(bb_params):
        # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] * range + low
        # (presumably bb_params[0] is alpha and bb_params[1] is beta).
        return (1.0 / (1.0 + bb_params[1] / bb_params[0])) * (high - low) + low

    # Batch of size=2 and deterministic (True).
    det_params = param_space.sample(2)
    det_params["a"] = softmax(det_params["a"])
    input_ = [det_params, True]
    expected_mean = {
        "a": np.argmax(det_params["a"], axis=-1),
        "b": {
            "ba": det_params["b"]["ba"][0],  # [0]=Mean
            "bb": scaled_beta_mean(det_params["b"]["bb"]),
            "bc": det_params["b"]["bc"][0],
        }
    }
    # Sample n times, expect always mean value (deterministic draw).
    for _ in range(50):
        test.test(("draw", input_), expected_outputs=expected_mean)
        test.test(("sample_deterministic", tuple([det_params])), expected_outputs=expected_mean)

    # Batch of size=1 and non-deterministic -> expect roughly the mean.
    stoch_params = param_space.sample(1)
    stoch_params["a"] = softmax(stoch_params["a"])
    input_ = [stoch_params, False]
    expected_mean = {
        # Expected category index under the softmaxed probabilities.
        "a": np.sum(stoch_params["a"] * np.array([0, 1, 2, 3])),
        "b": {
            "ba": stoch_params["b"]["ba"][0],  # [0]=Mean
            "bb": scaled_beta_mean(stoch_params["b"]["bb"]),
            "bc": stoch_params["b"]["bc"][0],
        }
    }

    outs = []
    for _ in range(100):
        outs.append(test.test(("draw", input_)))
        outs.append(test.test(("sample_stochastic", tuple([stoch_params]))))

    def empirical_mean(pick):
        # Mean over all collected draws of the value selected by `pick` (first batch item).
        return np.mean(np.stack([pick(o) for o in outs], axis=0), axis=0)

    recursive_assert_almost_equal(
        empirical_mean(lambda o: o["a"][0]), expected_mean["a"], atol=0.2
    )
    recursive_assert_almost_equal(
        empirical_mean(lambda o: o["b"]["ba"][0]), expected_mean["b"]["ba"][0], decimals=1
    )
    recursive_assert_almost_equal(
        empirical_mean(lambda o: o["b"]["bb"][0]), expected_mean["b"]["bb"][0], decimals=1
    )
    recursive_assert_almost_equal(
        empirical_mean(lambda o: o["b"]["bc"][0]), expected_mean["b"]["bc"][0], decimals=1
    )

    # Test log-likelihood outputs.
    params = param_space.sample(1)
    params["a"] = softmax(params["a"])
    # The beta log-density is computed on the unscaled sample first, because the numpy/scipy
    # calculation doesn't have scaling (assumes the sampled values lie within 0.0 and 1.0).
    values = values_space.sample(1)
    log_prob_beta = np.log(
        beta.pdf(values["b"]["bb"], params["b"]["bb"][0], params["b"]["bb"][1])
    )
    # Now do the scaling for b/bb (beta values).
    values["b"]["bb"] = values["b"]["bb"] * (high - low) + low

    log_prob_a = np.log(params["a"][0][values["a"][0]])
    log_prob_ba = np.sum(np.log(
        norm.pdf(values["b"]["ba"][0], params["b"]["ba"][0], params["b"]["ba"][1])
    ))
    log_prob_bc = np.sum(np.log(
        norm.pdf(values["b"]["bc"][0], params["b"]["bc"][0], params["b"]["bc"][1])
    ))
    expected_log_llh = log_prob_a + log_prob_ba + np.sum(log_prob_beta) + log_prob_bc

    test.test(("log_prob", [params, values]), expected_outputs=expected_log_llh, decimals=1)
def test_mixture(self):
    """
    Checks a MixtureDistribution of 3 bivariate normals: deterministic/stochastic draws and log-prob.

    Draws are compared against the categorical-probability-weighted sum of the component means;
    the log-prob is compared (decimals=1) against a scipy-based reference.
    """
    # Create a mixture distribution consisting of 3 bivariate normals.
    num_distributions = 3
    num_events_per_multivariate = 2  # 2=bivariate

    param_space_spec = {
        "categorical": FloatBox(shape=(num_distributions, ), low=-1.5, high=2.3)
    }
    for idx in range(num_distributions):
        param_space_spec["parameters{}".format(idx)] = Tuple(
            FloatBox(shape=(num_events_per_multivariate, )),  # mean
            FloatBox(shape=(num_events_per_multivariate, )),  # diag
        )
    param_space = Dict(param_space_spec, add_batch_rank=True)
    values_space = FloatBox(shape=(num_events_per_multivariate, ), add_batch_rank=True)
    input_spaces = dict(
        parameters=param_space,
        values=values_space,
        deterministic=bool,
    )

    # The Component to test.
    mixture = MixtureDistribution(
        # Try different spec types.
        MultivariateNormal(), "multi-variate-normal", "multivariate_normal",
        switched_off_apis={"entropy", "kl_divergence"}
    )
    test = ComponentTest(component=mixture, input_spaces=input_spaces)

    def mixture_mean(parameters):
        # Make probs for categorical, then weight each component's mean ([0] of its Tuple) by them.
        # The mean value is a 2D vector (bivariate distribution).
        categorical_probs = softmax(parameters["categorical"])
        return categorical_probs[:, 0:1] * parameters["parameters0"][0] + \
            categorical_probs[:, 1:2] * parameters["parameters1"][0] + \
            categorical_probs[:, 2:3] * parameters["parameters2"][0]

    # Batch of size=n and deterministic (True).
    # Note: Usually, the deterministic draw should return the max-likelihood value (the mean of the
    # argmax-categorical component); this test expects the probability-weighted mean instead.
    det_params = input_spaces["parameters"].sample(1)
    input_ = [det_params, True]
    expected = mixture_mean(det_params)
    for _ in range(50):
        test.test(("draw", input_), expected_outputs=expected)
        test.test(("sample_deterministic", tuple([det_params])), expected_outputs=expected)

    # Batch of size=1 and non-deterministic -> expect roughly the mean.
    stoch_params = input_spaces["parameters"].sample(1)
    input_ = [stoch_params, False]
    expected = mixture_mean(stoch_params)
    outs = []
    for _ in range(50):
        outs.append(test.test(("draw", input_)))
        outs.append(test.test(("sample_stochastic", tuple([stoch_params]))))

    recursive_assert_almost_equal(np.mean(np.array(outs), axis=0), expected, decimals=1)

    # Test log-likelihood outputs (against scipy).
    params = param_space.sample(1)
    # Make sure categorical params are softmaxed.
    category_probs = softmax(params["categorical"][0])
    values = values_space.sample(1)

    def weighted_log_density(idx):
        # Category prob times the summed per-dimension normal log-density of component `idx`.
        component = params["parameters{}".format(idx)]
        log_density = np.sum(
            np.log(norm.pdf(values[0], component[0][0], component[1][0])), axis=-1
        )
        return category_probs[idx] * log_density

    # NOTE(review): this is a probability-weighted sum of component log-densities, not the log of the
    # weighted sum of densities; it is only compared with decimals=1 - confirm this is intended.
    expected = weighted_log_density(0) + weighted_log_density(1) + weighted_log_density(2)

    test.test(("log_prob", [params, values]), expected_outputs=np.array([expected]), decimals=1)
def _graph_fn_calc_v_trace_values(self, logits_actions_pi, log_probs_actions_mu, actions, actions_flat,
                                  discounts, rewards, values, bootstrapped_values):
    """
    Computes the V-trace values and the policy-gradient advantages from log importance weights
    (see [1] for details).

    Calculation:
    vs = V(xs) + SUM[t=s to s+N-1]( gamma^t-s * ( PROD[i=s to t-1](ci) ) * dt_V )
    with:
        dt_V = rho_t * (rt + gamma V(xt+1) - V(xt))
        rho_t and ci being the clipped IS weights

    The sum is evaluated recursively from the end of the trajectory:
        (vs - V(xs)) = dt_V + gamma * cs * (vs+1 - V(xs+1))

    Args:
        logits_actions_pi (SingleDataOp): The raw logits output of the pi-network (one logit per discrete
            action).
        log_probs_actions_mu (SingleDataOp): The log-probs of the mu-network (one log-prob per discrete
            action).
        actions (SingleDataOp): The (int encoded) actually taken actions.
        actions_flat (SingleDataOp): The one-hot converted actually taken actions.
        discounts (SingleDataOp): DataOp (time x batch x values) holding the discounts collected when
            stepping through the environment (for the timesteps s=t to s=t+N-1).
        rewards (SingleDataOp): DataOp (time x batch x values) holding the rewards collected when stepping
            through the environment (for the timesteps s=t to s=t+N-1).
        values (SingleDataOp): DataOp (time x batch x values) holding the value function estimates
            wrt. the learner's policy (pi) (for the timesteps s=t to s=t+N-1).
        bootstrapped_values (SingleDataOp): DataOp (time(1) x batch x values) holding the last
            (bootstrapped) value estimate to use as a value function estimate after n time steps
            (V(xs) for s=t+N).

    Returns:
        tuple:
            - v-trace values (vs) in time x batch dimensions used to train the value-function (baseline).
            - PG-advantage values in time x batch dimensions used for training via policy gradient with
                baseline.
        Nothing is returned explicitly if neither the python nor the tf backend is active.
    """
    # Simplified (not performance optimized!) numpy implementation of v-trace for testing purposes.
    if get_backend() == "python" or self.backend == "python":
        # log(a/b) = log(a) - log(b); only the entries of the actually taken actions are kept.
        log_pi = np.log(softmax(logits_actions_pi, axis=-1))
        log_ratios = log_pi - log_probs_actions_mu
        is_weights = np.exp(np.sum(log_ratios * actions_flat, axis=-1, keepdims=True))

        def clip(bound):
            # min(bound, IS-weights); a bound of None switches clipping off.
            return is_weights if bound is None else np.minimum(bound, is_weights)

        rho_t = clip(self.rho_bar)
        rho_t_pg = clip(self.rho_bar_pg)  # Same for rho-PG (policy gradients).
        c_i = clip(self.c_bar)

        # Values t+1 -> shift by one time step and append the bootstrapped value.
        next_values = np.concatenate((values[1:], bootstrapped_values), axis=0)
        deltas = rho_t * (rewards + discounts * next_values - values)

        # Run the recursion backwards in time. The list is seeded with zeros (the term "after" the last
        # timestep), which is dropped again once the results are brought back into forward order.
        backward_terms = [np.zeros_like(np.squeeze(bootstrapped_values, axis=0))]
        for gamma, c, delta in zip(discounts[::-1], c_i[::-1], deltas[::-1]):
            backward_terms.append(delta + gamma * c * backward_terms[-1])
        vs_minus_v_xs = np.array(backward_terms[::-1])[:-1]

        # Add V(x_s) to get v_s.
        vs = vs_minus_v_xs + values

        # Advantage for policy gradient.
        next_vs = np.concatenate([vs[1:], bootstrapped_values], axis=0)
        pg_advantages = (rho_t_pg * (rewards + discounts * next_vs - values))
        return vs, pg_advantages

    if get_backend() == "tf":
        # Log IS-weights: logIS = log(pi(a|s)) - log(mu(a|s)), using only the actions actually taken.
        # The negative sparse cross-entropy is the log-prob (under pi) of the taken action.
        cross_entropy_pi = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits_actions_pi, labels=actions
        )
        log_pi_taken = tf.expand_dims(-cross_entropy_pi, axis=-1)
        log_mu_taken = tf.reduce_sum(
            input_tensor=log_probs_actions_mu * actions_flat, axis=-1, keepdims=True,
            name="log-probs-actions-taken-mu"
        )
        is_weights = tf.exp(x=log_pi_taken - log_mu_taken, name="is-weights-from-logs")

        def clip(bound, op_name):
            # min(bound, IS-weights); a bound of None switches clipping off.
            if bound is None:
                return is_weights
            return tf.minimum(x=bound, y=is_weights, name=op_name)

        # Apply rho-bar (also for PG) and c-bar clipping to all IS-weights.
        rho_t = clip(self.rho_bar, "clip-rho-bar")
        rho_t_pg = clip(self.rho_bar_pg, "clip-rho-bar-pg")
        c_i = clip(self.c_bar, "clip-c-bar")

        # Same as `values`, but shifted by 1 timestep and with the bootstrapped V value (s=t+N) as last item.
        next_values = tf.concat(values=[values[1:], bootstrapped_values], axis=0, name="values-t-plus-1")

        # Temporal difference terms (delta-t-V in the paper) for each s=t to s=t+N-1.
        dt_vs = rho_t * (rewards + discounts * next_values - values)

        # All terms [vs - V(xs)] are computed first (scanning over the time-reversed sequences, starting
        # from zeros); V(xs) is added afterwards to get the v-traces.
        reversed_sequences = (
            tf.reverse(tensor=discounts, axis=[0], name="revert-discounts"),
            tf.reverse(tensor=c_i, axis=[0], name="revert-c-i"),
            tf.reverse(tensor=dt_vs, axis=[0], name="revert-dt-vs")
        )

        def backward_step(accumulated, step_inputs):
            gamma_t, c_t, dt_v = step_inputs
            return dt_v + gamma_t * c_t * accumulated

        vs_minus_v_xs = tf.scan(
            fn=backward_step,
            elems=reversed_sequences,
            initializer=tf.zeros_like(tensor=tf.squeeze(bootstrapped_values, axis=0)),
            parallel_iterations=1,
            back_prop=False,
            name="v-trace-scan"
        )
        # Reverse the results back to original order.
        vs_minus_v_xs = tf.reverse(tensor=vs_minus_v_xs, axis=[0], name="revert-vs-minus-v-xs")

        # Add V(xs) to get vs.
        vs = tf.add(x=vs_minus_v_xs, y=values)

        # Advantages for the policy gradient loss term: A = qs - V(s), with qs = r + gamma * v-trace(s+1)
        # and V being the approximate value function output.
        next_vs = tf.concat(values=[vs[1:], bootstrapped_values], axis=0)
        pg_advantages = rho_t_pg * (rewards + discounts * next_vs - values)

        # Both outputs are wrapped in stop_gradient before being returned.
        return tf.stop_gradient(vs), tf.stop_gradient(pg_advantages)