def test_linear_parameter_using_global_time_step(self):
    max_time_steps = 100
    linear_decay = Decay.make("linear-decay", from_=2.0, to_=0.5, max_time_steps=max_time_steps)
    # Call without any parameters -> force component to use the GLOBAL_STEP, which starts at 0 (no decay yet)
    # and increments with every call.
    for time_step in range(30):
        out = linear_decay()
        check(out, 2.0 - (time_step / max_time_steps) * (2.0 - 0.5))
def test_dqn2015_loss_function(self):
    # Batch of size=2.
    input_ = {
        "x": np.random.random(size=(2, 2)),  # states don't matter for this test as Q-funcs are faked
        "a": np.array([0, 1]),
        "r": np.array([9.4, -1.23]),
        "t": np.array([False, False]),
        "x_": np.random.random(size=(2, 2))  # states don't matter for this test as Q-funcs are faked
    }
    # Fake q-nets. Just have to be callables, returning some q-values.
    q_net = lambda s, a: np.array([10.0, -90.6])
    target_q_net = lambda s_: np.array([[12.0, -8.0], [22.3, 10.5]])

    """
    Calculation: batch of 2, gamma=1.0
    Qt(s',.) = [[12, -8], [22.3, 10.5]] -> max(a') = [12, 22.3]
    Q(s,a)   = [10.0, -90.6]
    L = E(batch)| 0.5((r + gamma max(a')Qt(s'a')) - Q(s,a))^2 |
    L = (0.5(9.4 + 1.0*12 - 10.0)^2 + 0.5(-1.23 + 1.0*22.3 - -90.6)^2) / 2
    L = (0.5(129.96) + 0.5(12470.1889)) / 2
    L = (64.98 + 6235.09445) / 2
    L = 3150.037225
    """
    # Batch size=2 -> two per-item loss values.
    expected_loss_per_item = np.array([64.979996, 6235.09445], dtype=np.float32)
    # Expect the mean over the batch.
    expected_loss = expected_loss_per_item.mean()

    out = DQN2015Loss()(input_, q_net, target_q_net, namedtuple("FakeDQN2015Config", ["gamma"])(gamma=1.0))
    check(out.numpy(), expected_loss, decimals=2)
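# Sanity-check sketch (not part of the test suite): recompute the expected per-item losses from the
# docstring above with plain NumPy, independently of the `DQN2015Loss` class. Uses the module's
# existing `np` import; all names here are illustrative only.
def _sketch_dqn2015_expected_loss():
    r = np.array([9.4, -1.23])
    gamma = 1.0
    q_s_a = np.array([10.0, -90.6])  # Q(s,a) from the fake q_net
    qt_s_ = np.array([[12.0, -8.0], [22.3, 10.5]])  # Qt(s',.) from the fake target net
    # TD targets and the standard 0.5 * td_error^2 loss per item.
    td_target = r + gamma * np.max(qt_s_, axis=-1)
    loss_per_item = 0.5 * (td_target - q_s_a) ** 2
    print(loss_per_item)  # -> [64.98, 6235.09445]
    print(loss_per_item.mean())  # -> 3150.037225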
def sync_from(self, other_model, tau=1.0):
    """
    Syncs all of this Model's weights from the `other_model` using the formula:

    [new weights] = tau * [other_model's weights] + (1.0 - tau) * [old weights]

    Args:
        other_model (Model): The other Model to sync from.
        tau (float): The tau parameter used for soft-syncing (see formula above).
    """
    other_values = other_model.get_weights(as_ref=False)
    if AssertModelSync is True:
        try:
            check(self.get_weights(), other_values)
            print("WARNING: model weights were equal.")
        except AssertionError:
            pass
    if tau == 1.0:
        self.set_weights(other_values)
    else:
        our_vars = self.get_weights(as_ref=True)
        for our_var, other_var in zip(our_vars, other_values):
            tf.keras.backend.set_value(our_var, tau * other_var + (1.0 - tau) * our_var)
    if AssertModelSync is True:
        check(self.get_weights(), other_values)
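# Illustrative sketch (assumed values, not from the library): what the soft-sync (Polyak averaging)
# formula in the docstring does to a single weight array, shown with plain NumPy.
def _sketch_soft_sync():
    tau = 0.005
    old_target_w = np.array([1.0, 2.0, 3.0])
    online_w = np.array([2.0, 0.0, 3.0])
    # Exactly the tau-branch above, per variable:
    new_target_w = tau * online_w + (1.0 - tau) * old_target_w
    print(new_target_w)  # -> [1.005, 1.99, 3.0]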
def test_flatten_alongside(self):
    space = Dict(
        {
            "a": Float(shape=(4,)),
            "b": Dict({
                "ba": Tuple([Float(shape=(3,)), Float(0.1, 1.0, shape=(3,))]),
                "bb": Tuple([Float(shape=(2,)), Float(shape=(2,))]),
                "bc": Tuple([Float(shape=(4,)), Float(0.1, 1.0, shape=(4,))]),
            })
        },
        main_axes="B")

    # Flatten alongside this structure.
    alongside = dict(a=True, b=dict(ba=False, bc=None, bb="foo"))

    input_ = space.sample(2)
    # Expect to only flatten down to ["b"]["ba"/"bb"/"bc"], not into the Tuples, as `alongside` does
    # not descend into these.
    out = flatten_alongside(input_, alongside)
    expected = [input_["a"], input_["b"]["ba"], input_["b"]["bb"], input_["b"]["bc"]]
    check(out, expected)
def test_polynomial_parameter_using_global_time_step(self):
    max_time_steps = 10
    polynomial_decay = Decay.make("polynomial-decay", from_=3.0, to_=0.5, max_time_steps=max_time_steps)
    # Call without any parameters -> force component to use its internal `current_time_step`.
    # Go over the max time steps and expect `time_percentage` to be capped at 1.0.
    for time_step in range(50):
        out = polynomial_decay()
        check(out, (3.0 - 0.5) * (1.0 - min(time_step / max_time_steps, 1.0)) ** 2 + 0.5)
def test_exponential_parameter_using_global_time_step(self):
    max_time_steps = 10
    decay_rate = 0.1
    exponential_decay = ExponentialDecay.make(
        from_=3.0, to_=0.5, max_time_steps=max_time_steps, decay_rate=decay_rate
    )
    # Call without any parameters -> force component to use its internal `current_time_step`.
    # Go over the max time steps and expect `time_percentage` to be capped at 1.0.
    for time_step in range(100):
        out = exponential_decay()
        check(out, 0.5 + (3.0 - 0.5) * decay_rate ** min(time_step / max_time_steps, 1.0))
def test_gradient_tape(self):
    # Var to optimize.
    var = tf.Variable(random.random())
    # Derivative of the loss is dL/dv = 2*(v - 1.0) = 2v - 2.
    expected_grad = 2 * var.numpy() - 2.0
    # Must use a gradient tape as we are in eager mode. In graph mode, we would use `get_gradients`,
    # which does not work here.
    with tf.GradientTape() as t:
        loss = self.L(var)
    check(t.gradient(loss, var), expected_grad)
def test_dddqn_loss_function(self):
    """
    Tests the dueling/double q-loss function assuming an n-step of 1.
    """
    # Batch of size=2.
    input_ = {
        "s": np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]),
        "a": np.array([2, 1]),
        "r": np.array([10.3, -4.25]),
        "t": np.array([False, True]),
        # Make s' distinguishable from s via its values for the fake q-net to notice.
        "s_": np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]),
        "n": np.array([1, 1])
    }
    # Fake q-nets. Just have to be callables.
    # The q-net is first called by the loss function with s', then with s. Detect this difference here
    # and return different q-values for the different states.
    q_net = lambda s: dict(
        A=np.array([[-12.3, 1.2, 1.4], [12.2, -11.5, 9.2]]) if s[0][0] == 1.0
        else np.array([[10.0, -10.0, 12.4], [-0.101, -4.6, -9.3]]),
        V=np.array([1.0, -2.0])
    )
    target_q_net = lambda s_: dict(A=np.array([[-10.3, 1.5, 1.4], [8.2, -10.9, 9.3]]), V=np.array([0.1, -0.2]))

    """
    Calculation: batch of 2, gamma=0.9
    a' = [2 0]  <- argmax(a')Q(s'a')
    Qt(s',.) = 0.1 + [-10.3, 1.5, 1.4] - -2.4666(A-avg)
              -0.2 + [8.2, -10.9, 9.3] - 2.2(A-avg)
    -> Qt(s'a') = [0.1 + 1.4 + 2.4666 = 3.9666]
                  [0.0  <- terminal=True]
    a = [2 1]
    Q(s,a) = 1.0 + [12.4] - 4.1333(A-avg)
            -2.0 + [-4.6] - -4.667(A-avg)
           = [9.2667, -1.933]
    L = E(batch)| 0.5((r + gamma Qt(s'( argmax(a') Q(s'a') ))) - Q(s,a))^2 |
    L = (0.5(10.3 + 0.9*3.9666 - 9.2667)^2 + 0.5(-4.25 + 0.9*0.0 - -1.933)^2) / 2
    L = (0.5(4.60324)^2 + 0.5(-2.317)^2) / 2  <- td-errors are the numbers inside the (...)^2 brackets
    L = (21.1898184976 + 5.368489) / 4
    L = 26.5583074976 / 4
    L = 6.6395768744
    """
    # Expect the mean over the batch.
    expected_loss = 6.6395768744
    expected_td_errors = [4.60333333, 2.317]  # absolute values

    out = DDDQNLoss()(input_, q_net, target_q_net, namedtuple("FakeDDDQNConfig", ["gamma"])(gamma=0.9))
    check(out[0].numpy(), expected_loss, decimals=3)
    check(out[1].numpy(), expected_td_errors, decimals=2)
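# Sanity-check sketch (not part of the test suite): the dueling aggregation used in the calculation
# above, Q(s,a) = V(s) + A(s,a) - mean_a(A(s,.)), recomputed with plain NumPy on the fake values.
def _sketch_dueling_q_values():
    A = np.array([[10.0, -10.0, 12.4], [-0.101, -4.6, -9.3]])  # advantages for s (batch of 2)
    V = np.array([1.0, -2.0])  # state-values for s
    a = np.array([2, 1])  # actions taken
    q_all = V[:, None] + A - A.mean(axis=-1, keepdims=True)
    q_s_a = q_all[np.arange(2), a]
    print(q_s_a)  # -> [9.2667, -1.933], matching the docstring above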
def test_get_records(self):
    """
    Tests if retrieval correctly manages capacity.
    """
    capacity = 10
    memory = ReplayBuffer(record_space=self.record_space, capacity=capacity)

    # Insert 1 record.
    data = self.record_space.sample(1)
    memory.add_records(data)

    # Assert we can now fetch 1 element.
    retrieved_data = memory.get_records(num_records=1)
    self.assertEqual(1, len(retrieved_data["terminals"]))
    check(data, retrieved_data)

    # Test duplicate sampling.
    retrieved_data = memory.get_records(num_records=5)
    self.assertEqual(5, len(retrieved_data["terminals"]))
    # Only one record in the memory -> returned samples should all be the exact same.
    check(retrieved_data["reward"][0], retrieved_data["reward"][1])
    check(retrieved_data["reward"][0], retrieved_data["reward"][2])
    check(retrieved_data["reward"][0], retrieved_data["reward"][3])
    check(retrieved_data["reward"][0], retrieved_data["reward"][4])

    # Now insert another one.
    data = self.record_space.sample()  # w/o batch rank
    memory.add_records(data)
    # Pull exactly two records and make sure they are NOT(!) the same.
    retrieved_data = memory.get_records(num_records=2)
    self.assertEqual(2, len(retrieved_data["terminals"]))
    self.assertNotEqual(retrieved_data["reward"][0], retrieved_data["reward"][1])

    # Now insert over capacity.
    data = self.record_space.sample(capacity)
    memory.add_records(data)

    # Assert we can fetch exactly `capacity` elements.
    retrieved_data = memory.get_records(num_records=capacity)
    self.assertEqual(capacity, len(retrieved_data["terminals"]))
def test_dddqn_learning_on_grid_world_2x2(self):
    # Create an Env object.
    env = GridWorld("2x2", actors=1)

    # Add the preprocessor.
    preprocessor = Preprocessor(
        lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
    )
    # Create a Config.
    dqn_config = DDDQNConfig.make(
        "{}/../configs/dddqn_grid_world_2x2_learning.json".format(os.path.dirname(__file__)),
        preprocessor=preprocessor,
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space
    )
    # Create an Algo object.
    algo = DDDQN(config=dqn_config, name="my-dddqn")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(algo)

    # Run and wait for env to complete.
    env.run(ticks=3000, sync=True, render=debug.RenderEnvInLearningTests)

    # Check last n episode returns.
    n = 10
    mean_last_n = np.mean(env.historic_episodes_returns[-n:])
    print("Avg return over last {} episodes: {}".format(n, mean_last_n))
    self.assertTrue(mean_last_n >= 0.6)

    # Check learnt Q-function (using our dueling layer).
    a_and_v = algo.Q(one_hot(np.array([0, 0, 0, 0, 1, 1, 1, 1]), depth=4))
    q = dueling(a_and_v, np.array([0, 1, 2, 3, 0, 1, 2, 3]))
    print(q)
    self.assertTrue(q[1] < min(q[2:]) and q[1] < q[0])  # q(s=0,a=right) is the worst
    check(q[5], 1.0, atol=0.4)  # Q(1,->) is close to 1.0
    #self.assertTrue(q[5] > max(q[:4]) and q[5] > max(q[6:]))  # q(s=1,a=right) is the best
    #check(q, [0.8, -5.0, 0.9, 0.8, 0.8, 1.0, 0.9, 0.9], decimals=1)  # a=up,down,left,right

    env.terminate()
def test_neg_log_likelihood_loss_function_w_simple_space(self):
    shape = (5, 4, 3)
    parameters_space = Tuple(Float(shape=shape), Float(shape=shape), main_axes="B")
    labels_space = Float(shape=shape, main_axes="B")

    loss_function = NegLogLikelihoodLoss(distribution=get_default_distribution_from_space(labels_space))

    parameters = parameters_space.sample(10)
    # Make sure stddev params are not too crazy (just like our adapters do clipping for the raw NN output).
    parameters = (parameters[0], np.clip(parameters[1], 0.1, 1.0))
    labels = labels_space.sample(10)
    expected_loss_per_item = np.sum(
        -np.log(sts.norm.pdf(labels, parameters[0], parameters[1])), axis=(-1, -2, -3)
    )

    out = loss_function(parameters, labels)
    check(out, expected_loss_per_item, decimals=4)
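# Reference sketch: the expected values above rely on the Gaussian log-density computed via
# `sts.norm.pdf`. Its closed form is -log N(x; mu, sigma) = 0.5*log(2*pi*sigma^2) + (x - mu)^2 / (2*sigma^2);
# a quick numeric check of that identity (illustrative values):
def _sketch_normal_nll_closed_form():
    x, mu, sigma = 0.3, 0.1, 0.5
    closed_form = 0.5 * np.log(2 * np.pi * sigma ** 2) + (x - mu) ** 2 / (2 * sigma ** 2)
    via_scipy = -np.log(sts.norm.pdf(x, mu, sigma))
    print(np.isclose(closed_form, via_scipy))  # -> True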
def test_apply_gradients(self):
    lr = random.random()
    optimizer = SGD(learning_rate=lr)
    # Var to optimize.
    var = tf.Variable(random.random())
    var_value_orig = var.numpy()
    # Derivative of the loss is dL/dv = 2*(v - 1.0) = 2v - 2.
    expected_grad = 2 * var_value_orig - 2.0
    # Must use a gradient tape as we are in eager mode. In graph mode, we would use `get_gradients`,
    # which does not work here.
    with tf.GradientTape() as t:
        loss = self.L(var)
    optimizer.apply_gradients(grads_and_vars=[(t.gradient(loss, var), var)])
    # Check the variable now. It should have changed by -learning_rate * grad.
    var_value_after = var.numpy()
    expected_new_value = var_value_orig - (lr * expected_grad)
    check(var_value_after, expected_new_value)
def test_neg_log_likelihood_loss_function_w_container_space(self):
    parameters_space = Dict(
        {
            # Make sure stddev params are not too crazy (just like our adapters do clipping for the
            # raw NN output).
            "a": Tuple(Float(shape=(2, 3)), Float(0.5, 1.0, shape=(2, 3))),  # normal (mean and stddev)
            "b": Float(shape=(4,), low=-1.0, high=1.0)  # 4-discrete (logits)
        },
        main_axes="B")
    labels_space = Dict({"a": Float(shape=(2, 3)), "b": Int(4)}, main_axes="B")

    loss_function = NegLogLikelihoodLoss(distribution=get_default_distribution_from_space(labels_space))

    parameters = parameters_space.sample(2)
    # Softmax the discrete params.
    probs_b = softmax(parameters["b"])
    # probs_b = parameters["b"]
    labels = labels_space.sample(2)

    # Expected loss: sum of all -log(llh).
    log_prob_per_item_a = np.sum(
        np.log(sts.norm.pdf(labels["a"], parameters["a"][0], parameters["a"][1])), axis=(-1, -2)
    )
    log_prob_per_item_b = np.array([
        np.log(probs_b[0][labels["b"][0]]),
        np.log(probs_b[1][labels["b"][1]])
    ])
    expected_loss_per_item = -(log_prob_per_item_a + log_prob_per_item_b)

    out = loss_function(parameters, labels)
    check(out, expected_loss_per_item, decimals=4)
def test_minimize(self):
    # Test case not working w/o graph mode.
    return

    lr = random.random()
    optimizer = Adam(learning_rate=lr)
    # Var to optimize.
    var = tf.Variable(random.random())
    var_value_orig = var.numpy()
    # Derivative of the loss is dL/dv = 2*(v - 1.0) = 2v - 2.
    expected_grad = 2 * var_value_orig - 2.0
    # Must use a gradient tape as we are in eager mode. In graph mode, we would use `get_gradients`,
    # which does not work here.
    with tf.GradientTape() as t:
        loss = self.L(var)
    optimizer.minimize(loss, [var])
    # Check the variable now. It should have changed by -learning_rate * grad.
    var_value_after = var.numpy()
    expected_new_value = var_value_orig - (lr * expected_grad)
    check(var_value_after, expected_new_value)
def test_dads_learning_on_grid_world_4room(self):
    # Create an Env object.
    env = GridWorld("4-room")

    # Add the preprocessor.
    preprocessor = Preprocessor(
        lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
    )
    # Create a Config.
    config = DADSConfig.make(
        "{}/../configs/dads_grid_world_4room_learning.json".format(os.path.dirname(__file__)),
        preprocessor=preprocessor,
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space
    )
    # Create an Algo object.
    algo = DADS(config=config, name="my-dads")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(algo)

    # Run and wait for env to complete.
    env.run(ticks=3000, sync=True, render=debug.RenderEnvInLearningTests)

    # Check last n episode returns.
    n = 10
    mean_last_n = np.mean(env.historic_episodes_returns[-n:])
    print("Avg return over last {} episodes: {}".format(n, mean_last_n))
    self.assertTrue(mean_last_n >= 0.3)

    # Check learnt Q-function.
    check(algo.q(
        np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]])
    ), [[0.8, -5.0, 0.9, 0.8], [0.8, 1.0, 0.9, 0.9]], decimals=1)  # a=up,down,left,right

    env.terminate()
def test_next_state_handling(self):
    """
    Tests if next-states can be stored efficiently (not using any space!) in the memory.

    NOTE: The memory does not care about terminal signals; it will always return the n-next-in-memory
    state regardless of whether this is a useful state (terminal=False) or not (terminal=True).
    In case of terminal=True, the next state (whether it be the true terminal state, the reset state,
    or any other random state) does not matter anyway.
    """
    capacity = 10
    batch_size = 2

    # Test all classes of memories.
    for class_ in [ReplayBuffer, PrioritizedReplayBuffer]:
        memory = class_(record_space=self.record_space_no_next_state, capacity=capacity,
                        next_record_setup=dict(s="s_"))

        # Insert n records (inserts must always be batch-size).
        data = dict(
            s=dict(s1=np.array([0.0, 1.0]), s2=np.array([2.0, 3.0])),
            a=np.array([0, 1]),
            r=np.array([-0.0, -1.0]),
            t=np.array([False, True]),
            s_=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1]))
        )
        memory.add_records(data)

        # Check whether inserting the wrong batch size raises an exception.
        try:
            data = self.record_space_no_next_state.sample(batch_size + 1)
            data["s_"] = self.record_space_no_next_state["s"].sample(batch_size)
            memory.add_records(data)
            assert False, "ERROR: Should not get here. Error is expected."
        except SurrealError:
            pass

        # Assert we can now fetch n elements.
        retrieved_data = memory.get_records(num_records=1)
        self.assertEqual(1, len(retrieved_data["t"]))
        # Check the next state.
        if retrieved_data["s"]["s1"][0] == 0.0:
            self.assertTrue(retrieved_data["s_"]["s1"] == 0.1 and retrieved_data["s_"]["s2"] == 2.1)
        else:
            self.assertTrue(retrieved_data["s"]["s1"] == 1.0)
            self.assertTrue(retrieved_data["s_"]["s1"] == 1.1 and retrieved_data["s_"]["s2"] == 3.1)

        # Insert another 2xn records, then check for correct next-state returns when getting records.
        data = dict(
            s=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1])),
            a=np.array([2, 3]),
            r=np.array([-2.0, -3.0]),
            t=np.array([False, False]),
            s_=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2]))
        )
        memory.add_records(data)
        data = dict(
            s=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2])),
            a=np.array([4, 5]),
            r=np.array([-4.0, -5.0]),
            t=np.array([True, True]),
            s_=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3]))
        )
        memory.add_records(data)

        for _ in range(20):
            retrieved_data = memory.get_records(num_records=2)
            self.assertEqual(2, len(retrieved_data["t"]))
            # Check the next states (always 0.1 larger than state).
            for i in range(2):
                check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)
        self.assertTrue(memory.size == 6)

        # Insert up to capacity and check again.
        data = dict(
            s=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3])),
            a=np.array([6, 7]),
            r=np.array([-6.0, -7.0]),
            t=np.array([True, False]),
            s_=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4]))
        )
        memory.add_records(data)
        data = dict(
            s=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4])),
            a=np.array([8, 9]),
            r=np.array([-8.0, -9.0]),
            t=np.array([False, False]),
            s_=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5]))
        )
        memory.add_records(data)

        for _ in range(20):
            retrieved_data = memory.get_records(num_records=3)
            self.assertEqual(3, len(retrieved_data["t"]))
            # Check the next states (always 0.1 larger than state).
            for i in range(3):
                check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)
        self.assertTrue(memory.size == 10)

        # Go a little bit (one batch) over capacity and check again.
        data = dict(
            s=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5])),
            a=np.array([10, 11]),
            r=np.array([-10.0, -11.0]),
            t=np.array([True, True]),
            s_=dict(s1=np.array([0.6, 1.6]), s2=np.array([2.6, 3.6]))
        )
        memory.add_records(data)

        for _ in range(20):
            retrieved_data = memory.get_records(num_records=4)
            self.assertEqual(4, len(retrieved_data["t"]))
            # Check the next states (always 0.1 larger than state).
            for i in range(4):
                check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)
        self.assertTrue(memory.size == 10)
def test_constant(self):
    constant = Constant.make(2.0)
    input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23])
    out = constant(input_)
    check(out, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0])
def test_sac_learning_on_grid_world_2x2(self):
    # Create an Env object.
    env = GridWorld("2x2", actors=1)

    # Add the preprocessor (not really necessary, as the NN will automatically one-hot, but faster, as
    # states are then stored in memory already preprocessed and won't have to be preprocessed again
    # for batch-updates).
    preprocessor = Preprocessor(
        lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
    )
    # Create a Config.
    config = SACConfig.make(
        "{}/../configs/sac_grid_world_2x2_learning.json".format(os.path.dirname(__file__)),
        preprocessor=preprocessor,
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space,
        summaries=[
            "Ls_critic[0]", "L_actor", "L_alpha", "alpha",
            ("Q(0,^)", "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([0])})"),
            ("Q(0,->)", "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([1])})"),
            ("Q(0,v)", "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([2])})"),
            ("Q(0,<-)", "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([3])})"),
            ("Q(1,->)", "Q[0]({'s': np.array([[0., 1., 0., 0.]]), 'a': np.array([1])})")
        ]
    )
    # Create an Algo object.
    algo = SAC(config=config, name="my-sac")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(algo)

    # Run and wait for env to complete.
    env.run(ticks=700, sync=True, render=debug.RenderEnvInLearningTests)

    # Check learnt Q-function.
    q = algo.Q[0](dict(
        s=one_hot(np.array([0, 0, 0, 0, 1, 1, 1, 1]), depth=4),
        a=np.array([0, 1, 2, 3, 0, 1, 2, 3])
    ))
    print(q)
    self.assertTrue(q[1] < min(q[2:]) and q[1] < q[0])  # q(s=0,a=right) is the worst
    check(q[5], 1.0, decimals=1)  # Q(1,->) is close to 1.0
    #check(q, [0.8, -5.0, 0.9, 0.8, 0.8, 1.0, 0.9, 0.9], decimals=1)  # a=up,down,left,right

    # Check last n episode returns.
    n = 10
    mean_last_n = np.mean(env.historic_episodes_returns[-n:])
    print("Avg return over last {} episodes: {}".format(n, mean_last_n))
    self.assertTrue(mean_last_n >= 0.7)

    env.terminate()
def test_sac_loss_function(self):
    # Batch of size=2.
    input_ = {
        "s": np.random.random(size=(2, 2)),  # states don't matter for this test as Q-funcs are faked
        "a": np.array([[-0.5], [0.5]]),  # action space = Float(shape=(1,))
        "r": np.array([9.4, -1.23]),
        "t": np.array([False, False]),
        "s_": np.random.random(size=(2, 2))  # states don't matter for this test as Q-funcs are faked
    }

    # Fake pi/q-nets. Just have to be callables, returning some q-values.
    def pi(s, log_likelihood=False):
        assert log_likelihood is True
        # Return fake action sample and log-likelihoods.
        # Actions according to the action space (Float(1,)), log-likelihoods always with shape=().
        return np.array([[-0.5], [0.5]]), np.array([-0.4, -1.0])

    pi.get_weights = lambda as_ref: []

    gamma = 1.0
    q_nets = [lambda s_a: np.array([10.0, -90.6]), lambda s_a: np.array([10.1, -90.5])]
    q_nets[0].get_weights = lambda as_ref: []
    q_nets[1].get_weights = lambda as_ref: []
    target_q_nets = [lambda s_a: np.array([12.0, -8.0]), lambda s_a: np.array([22.3, 10.5])]
    target_q_nets[0].get_weights = lambda as_ref: []
    target_q_nets[1].get_weights = lambda as_ref: []
    alpha = tf.Variable(0.5, dtype=tf.float64)
    entropy_target = 0.97

    out = SACLoss()(
        input_, alpha, entropy_target, pi, q_nets, target_q_nets,
        namedtuple("FakeSACConfig", ["gamma", "entropy_target", "optimize_alpha"])(
            gamma=gamma, entropy_target=entropy_target, optimize_alpha=True
        )
    )

    # Critic loss.
    """
    Calculation: batch of 2, gamma=1.0
    a' = pi(s') = [-0.5, 0.5]
    a' lllh = [-0.4, -1.0]  <- sampled a's log-likelihoods
    Q1t(s'a') = [12, -8]
    Q2t(s'a') = [22.3, 10.5]
    Qt(s'a') = [12, -8]  (reduce-min over the two target Q-nets)
    Q1(s,a) = [10, -90.6]
    Q2(s,a) = [10.1, -90.5]
    Li = E(batch)| 0.5((r + gamma (Qt(s'a') - alpha*log(pi(a'|s')))) - Qi(s,a))^2 |
    L1 = 0.5 * | (9.4 + (12 - 0.5*-0.4) - 10)^2 + (-1.23 + (-8 - 0.5*-1.0) - -90.6)^2 | / 2
    L1 = 0.5 * | (11.6)^2 + (81.87)^2 | / 2
    L1 = 3418.62845 / 2
    L1 = 1709.314225
    L2 = 0.5 * | (9.4 + (12 - 0.5*-0.4) - 10.1)^2 + (-1.23 + (-8 - 0.5*-1.0) - -90.5)^2 | / 2
    L2 = 0.5 * | (11.5)^2 + (81.77)^2 | / 2
    L2 = 3409.29145 / 2
    L2 = 1704.645725
    """
    expected_critic_loss = [np.array(1709.314225), np.array(1704.645725)]
    check([out[0][i].numpy() for i in range(2)], expected_critic_loss, decimals=3)

    # Actor loss.
    """
    Calculation: batch of 2, gamma=1.0
    log(pi(a|s)) = a lllh = [-0.4, -1.0]
    Q1(s,a) = [10.0, -90.6]
    Q2(s,a) = [10.1, -90.5]
    Q(s,a) = [10.0, -90.6]  <- reduce-min
    L = E(batch)| (alpha * log(pi(a|s)) - Q(s,a)) |
    L = [(alpha * -0.4 - 10.0) + (alpha * -1.0 - -90.6)] / 2
    L = [(0.5*-0.4 - 10.0) + (0.5*-1.0 + 90.6)] / 2
    L = (-10.2 + 90.1) / 2
    L = 39.95
    """
    expected_actor_loss = 39.95
    check(out[3].numpy(), expected_actor_loss, decimals=3)

    # Alpha loss.
    """
    Calculation: batch of 2, gamma=1.0
    H = entropy_target = 0.97
    log(pi(a|s)) = a lllh = [-0.4, -1.0]
    L = E(batch)| (-alpha * log(pi(a|s)) - alpha * H) |
    # In the SAC paper, alpha is used directly; the implementation, however, uses log(alpha).
    # See the discussion in https://github.com/rail-berkeley/softlearning/issues/37.
    L = [(-log(alpha) * -0.4 - log(alpha)*0.97) + (-log(alpha) * -1.0 - log(alpha)*0.97)] / 2
    L = [(-log(0.5)*-0.4 - log(0.5)*0.97) + (-log(0.5)*-1.0 - log(0.5)*0.97)] / 2
    L = [(0.69315*-0.4 + 0.69315*0.97) + (0.69315*-1.0 + 0.69315*0.97)] / 2
    L = (0.3950955 + -0.0207945) / 2
    L = 0.1871505
    """
    expected_alpha_loss = 0.1871505
    check(out[5].numpy(), expected_alpha_loss, decimals=3)
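# Sanity-check sketch (not part of the test suite): recompute the expected alpha loss with plain
# NumPy, making the log(alpha) subtlety from the docstring explicit.
def _sketch_sac_alpha_loss():
    alpha = 0.5
    entropy_target = 0.97
    log_llh = np.array([-0.4, -1.0])  # log(pi(a|s)) from the fake policy above
    # The implementation optimizes log(alpha) rather than alpha itself (see the softlearning
    # discussion linked in the docstring above).
    log_alpha = np.log(alpha)
    alpha_loss = np.mean(-log_alpha * log_llh - log_alpha * entropy_target)
    print(alpha_loss)  # -> 0.1871505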
def test_2x2_grid_world_with_2_actors(self):
    """
    Tests a minimalistic 2x2 GridWorld with two Actors.
    """
    env = GridWorld(world="2x2", actors=2)

    # Simple test runs with fixed actions.
    # X=player's position.
    env.reset_all()  # ["XH", " G"]
    env.act(np.array([2, 1]))  # down: [" H", "XG"], right: [" X", " G"]
    check(env.state, [1, 0])
    check(env.reward, [-0.1, -5.0])
    check(env.terminal, [False, True])
    env.act(np.array([1, 2]))  # right: [" H", " X"], down: [" H", "XG"]
    check(env.state, [0, 1])  # 0=state got already reset (flow envs)
    check(env.reward, [1.0, -0.1])
    check(env.terminal, [True, False])

    env.reset_all()
    env.act(np.array([1, 1]))  # both Actors move right: [" X", " G"] -> into the hole
    check(env.state, [0, 0])
    check(env.reward, [-5.0, -5.0])
    check(env.terminal, [True, True])

    # Run against a wall.
    env.act(np.array([3, 0]))  # left: ["XH", " G"], up: ["XH", " G"]
    check(env.state, [0, 0])
    check(env.reward, [-0.1, -0.1])
    check(env.terminal, [False, False])
    env.act(np.array([2, 0]))  # down: [" H", "XG"], up: ["XH", " G"]
    check(env.state, [1, 0])
    check(env.reward, [-0.1, -0.1])
    check(env.terminal, [False, False])
    env.act(np.array([0, 2]))  # up: ["XH", " G"], down: [" H", "XG"]
    check(env.state, [0, 1])
    check(env.reward, [-0.1, -0.1])
    check(env.terminal, [False, False])
    env.act(np.array([1, 1]))  # right: [" X", " G"], right: [" H", " X"]
    check(env.state, [0, 0])
    check(env.reward, [-5.0, 1.0])
    check(env.terminal, [True, True])

    env.terminate()
def test_dqn2015_functionality(self):
    # Fake q-net/qt-net used for this test.
    def q(s, a):
        return np.sum(dense(dense(s, weights_q[0], weights_q[1]), weights_q[2], weights_q[3]) *
                      one_hot(a, depth=4), axis=-1)

    def qt(s):
        return dense(dense(s, weights_qt[0], weights_qt[1]), weights_qt[2], weights_qt[3])

    env = GridWorld("2x2", actors=1)
    state_space = env.actors[0].state_space.with_batch()
    action_space = env.actors[0].action_space.with_batch()

    # Add the preprocessor.
    preprocessor = Preprocessor(
        lambda inputs_: tf.one_hot(inputs_, depth=state_space.num_categories)
    )
    preprocessed_space = preprocessor(state_space)

    # Add the Q-network.
    i = K.layers.Input(shape=preprocessed_space.shape, dtype=preprocessed_space.dtype)
    o = K.layers.Dense(2, activation="linear")(i)  # keep it very simple
    # o = K.layers.Dense(256)(o)
    q_network = K.Model(inputs=i, outputs=o)

    # Create a very simple DQN2015.
    dqn = DQN2015(config=DQN2015Config.make(
        "{}/../configs/dqn2015_grid_world_2x2_functionality.json".format(os.path.dirname(__file__)),
        preprocessor=preprocessor,
        q_network=q_network,
        state_space=state_space,
        action_space=action_space
    ), name="my-dqn")

    # Check slot of "x" in flattened mem.
    check(dqn.memory.next_record_setup["x"][1], [3])
    self.assertTrue(dqn.memory.batch_size is None)
    check(dqn.Q.get_weights(), dqn.Qt.get_weights())

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(dqn)

    # Set our weights fixed.
    weights = [
        np.array([[0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4]]),  # hidden layer kernel
        np.array([0.0, 0.0]),  # hidden layer bias
        np.array([[-0.4, -0.3, -0.2, -0.1], [0.4, 0.3, 0.2, 0.1]]),  # output layer kernel
        np.array([0.1, 0.1, 1.0, 0.0])  # output layer bias
    ]
    dqn.Q.set_weights(weights)

    # Perform one step in the env.
    expected_action = np.argmax(dqn.Q(dqn.Phi(env.state)), axis=-1)
    check(expected_action, 2)  # expect to go down
    env.run(ticks=1)  # ts=0 -> do nothing
    # Check action taken.
    check(dqn.a.value, expected_action)
    # Check state of the env after action taken.
    check(env.state[0], 1)
    check(env.reward[0], -0.1)
    check(env.terminal[0], False)
    # Check memory of dqn (after one time step, should still be empty).
    check(dqn.memory.size, 0)
    self.assertTrue(dqn.memory.batch_size is None)

    # Perform one step in the env.
    expected_action = np.argmax(dqn.Q(dqn.Phi(env.state)), axis=-1)
    check(expected_action, 2)  # expect to go down
    env.run(ticks=1)  # ts=1 -> no sync, no update
    # Check action taken.
    check(dqn.a.value, expected_action)
    # Check state of the env after action taken.
    check(env.state[0], 1)
    check(env.reward[0], -0.1)
    check(env.terminal[0], False)
    # Check memory of dqn.
    check(dqn.memory.size, 1)
    self.assertTrue(dqn.memory.batch_size == 1)  # batch_size is now established
    check(dqn.memory.memory, [
        np.array([2, 0, 0, 0]),
        np.array([-0.1, 0., 0., 0.]),
        np.array([False, False, False, False]),
        np.array([[1., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
    ])
    # Check next states.
    check(dqn.memory.next_records, [[np.array([[0., 1., 0., 0.]])]])

    # Perform one step in the env.
    # Capture the weights before the update (plus the target weights) for the manual gradient check below.
    weights_q_before_update = dqn.Q.get_weights()
    weights_q = copy.deepcopy(weights_q_before_update)
    weights_qt = dqn.Qt.get_weights()
    # Check action taken (the action is picked before(!) the update).
    expected_action = np.argmax(dqn.Q(dqn.Phi(np.array([1]))), axis=-1)
    env.run(ticks=1)  # ts=2 -> no sync, do update
    weights_q_after_update = dqn.Q.get_weights()
    check(dqn.a.value, expected_action)

    # Check the new weight values after the update via numeric differentiation: nudge each weight
    # individually, recompute the loss, and compare the finite-difference gradient against the
    # applied (plain SGD) update.
    loss = DQN2015Loss()(dqn.memory.last_records_pulled, q, qt, dqn.config)
    for i, matrix in enumerate(weights_q_before_update):
        for idx in np.ndindex(matrix.shape):
            weights_q = copy.deepcopy(weights_q_before_update)
            weights_q[i][idx] += 0.0001
            lossd = DQN2015Loss()(dqn.memory.last_records_pulled, q, qt, dqn.config)
            dL_over_dw = (lossd - loss) / 0.0001
            check(weights_q_after_update[i][idx],
                  weights_q_before_update[i][idx] - dL_over_dw * dqn.optimizer.learning_rate(0.0),
                  decimals=3)

    # Check state of the env after action taken.
    check(env.state[0], 1)
    check(env.reward[0], -0.1)
    check(env.terminal[0], False)
    # Check memory of dqn.
    check(dqn.memory.size, 2)
    check(dqn.memory.memory, [
        np.array([2, 2, 0, 0]),
        np.array([-0.1, -0.1, 0., 0.]),
        np.array([False, False, False, False]),
        np.array([[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
    ])
    # Check next states.
    check(dqn.memory.next_records, [[np.array([[0., 1., 0., 0.]])]])

    env.terminate()
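# Self-contained sketch of the technique used above: a one-sided finite-difference gradient check,
# dL/dw_i ~ (L(w + eps*e_i) - L(w)) / eps. The quadratic loss here is an illustrative stand-in, not
# the DQN loss.
def _sketch_finite_difference_gradient():
    def finite_diff_grad(loss_fn, w, eps=1e-4):
        base = loss_fn(w)
        grad = np.zeros_like(w)
        for idx in np.ndindex(w.shape):
            w_nudged = w.copy()
            w_nudged[idx] += eps  # nudge one weight at a time
            grad[idx] = (loss_fn(w_nudged) - base) / eps
        return grad

    # L(w) = sum((w - 1)^2) has the analytic gradient 2*(w - 1).
    w = np.array([[0.5, 2.0], [1.5, -1.0]])
    num_grad = finite_diff_grad(lambda w_: np.sum((w_ - 1.0) ** 2), w)
    print(np.round(num_grad, 3))  # -> [[-1., 2.], [1., -4.]]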
def test_linear_decay(self):
    linear_decay = LinearDecay.make({"from": 2.0, "to": 0.5})
    input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23])
    out = linear_decay(input_)
    check(out, 2.0 - input_ * (2.0 - 0.5))
def test_linear_decay_with_step_function(self):
    linear_decay = LinearDecay.make(
        {"from": 2.0, "to": 0.5, "begin_time_percentage": 0.5, "end_time_percentage": 0.6}
    )
    input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23, 0.51, 0.52, 0.55, 0.59])
    out = linear_decay(input_)
    check(out, np.array([2.0, 2.0, 0.5, 0.5, 2.0, 2.0, 0.5, 2.0, 1.85, 1.7, 1.25, 0.65]))
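# Sketch of the arithmetic behind the expected values above (an inference from the checks, not the
# LinearDecay implementation): `time_percentage` is rescaled into the
# [begin_time_percentage, end_time_percentage] window and clipped, then decayed linearly.
def _sketch_windowed_linear_decay():
    def windowed_linear(t, from_=2.0, to_=0.5, begin=0.5, end=0.6):
        t_w = np.clip((t - begin) / (end - begin), 0.0, 1.0)
        return from_ - t_w * (from_ - to_)

    print(windowed_linear(np.array([0.5, 0.55, 0.59, 0.61])))  # -> [2.0, 1.25, 0.65, 0.5]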
def test_update_records(self):
    memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=self.capacity)

    # Insert record samples.
    num_records = 2
    data = self.record_space.sample(num_records)
    memory.add_records(data)
    self.assertTrue(memory.size == num_records)
    self.assertTrue(memory.index == num_records)

    # Fetch records, their indices and weights.
    batch, indices, weights = memory.get_records_with_indices_and_weights(num_records)
    check(weights, np.ones(shape=(num_records,)))
    self.assertEqual(num_records, len(indices))
    self.assertTrue(memory.size == num_records)
    self.assertTrue(memory.index == num_records)

    # Update weight of index 0 to very small.
    memory.update_records(np.array([0]), np.array([0.01]))
    # Expect to sample almost only index 1 (which still has a weight of 1.0).
    for _ in range(100):
        _, indices, weights = memory.get_records_with_indices_and_weights(num_records=1000)
        self.assertGreaterEqual(np.sum(indices), 970)

    # Update weight of index 1 to very small as well.
    # Expect to sample both indices equally often.
    for _ in range(100):
        rand = np.random.random()
        memory.update_records(np.array([0, 1]), np.array([rand, rand]))
        _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
        self.assertGreaterEqual(np.sum(indices), 400)
        self.assertLessEqual(np.sum(indices), 600)

    # Update weights to be 1:2.
    # Expect to sample index 1 twice as often as index 0 (1.0 = 2 * 0.5).
    for _ in range(100):
        rand = np.random.random() * 10
        memory.update_records(np.array([0, 1]), np.array([rand, rand * 2]))
        _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
        self.assertGreaterEqual(np.sum(indices), 600)
        self.assertLessEqual(np.sum(indices), 750)

    # Update weights to be 1:4.
    # Expect to sample index 1 four times as often as index 0.
    for _ in range(100):
        rand = np.random.random() * 10
        memory.update_records(np.array([0, 1]), np.array([rand, rand * 4]))
        _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
        self.assertGreaterEqual(np.sum(indices), 750)
        self.assertLessEqual(np.sum(indices), 850)

    # Update weights to be 1:9.
    # Expect to sample index 1 nine times as often as index 0.
    for _ in range(100):
        rand = np.random.random() * 10
        memory.update_records(np.array([0, 1]), np.array([rand, rand * 9]))
        _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
        self.assertGreaterEqual(np.sum(indices), 850)
        self.assertLessEqual(np.sum(indices), 950)

    # Insert more record samples.
    num_records = 10
    data = self.record_space.sample(num_records)
    memory.add_records(data)
    self.assertTrue(memory.size == self.capacity)
    self.assertTrue(memory.index == 2)

    # Update weights to range from 0.1 to 512.0 and sample batches of fewer than 10 records.
    memory.update_records(
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        np.array([0.1, 1., 3., 8., 16., 32., 64., 128., 256., 512.])
    )
    counts = Counter()
    for _ in range(1000):
        _, indices, _ = memory.get_records_with_indices_and_weights(num_records=np.random.randint(1, 6))
        for i in indices:
            counts[i] += 1
    print(counts)
    self.assertTrue(
        counts[9] >= counts[8] >= counts[7] >= counts[6] >= counts[5] >=
        counts[4] >= counts[3] >= counts[2] >= counts[1] >= counts[0]
    )
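# Reference sketch: the sampling proportions asserted above follow from priority-proportional
# sampling, p(i) = w_i / sum_j(w_j), independent of the buffer implementation. The 1:2 case:
def _sketch_proportional_sampling():
    rng = np.random.default_rng(0)
    weights = np.array([0.5, 1.0])  # the 1:2 case from the test
    probs = weights / weights.sum()  # -> [1/3, 2/3]
    indices = rng.choice(len(weights), size=1000, p=probs)
    print(indices.sum())  # ~667: index 1 is drawn roughly twice as often as index 0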
def test_polynomial_parameter(self):
    polynomial_decay = Decay.make(type="polynomial-decay", from_=2.0, to_=0.5, power=2.0)
    input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23])
    out = polynomial_decay(input_)
    check(out, (2.0 - 0.5) * (1.0 - input_) ** 2 + 0.5)
def test_exponential_parameter(self):
    exponential_decay = Decay.make(type="exponential-decay", from_=2.0, to_=0.5, decay_rate=0.5)
    input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23])
    out = exponential_decay(input_)
    check(out, 0.5 + (2.0 - 0.5) * 0.5 ** input_)
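# Reference sketch mirroring the formulas in the checks above: the three decay schedules as pure
# functions of time_percentage t in [0, 1] (illustrative helpers, not the library's classes).
def _sketch_decay_schedules():
    def linear(t, from_=2.0, to_=0.5):
        return from_ - t * (from_ - to_)

    def polynomial(t, from_=2.0, to_=0.5, power=2.0):
        return (from_ - to_) * (1.0 - t) ** power + to_

    def exponential(t, from_=2.0, to_=0.5, decay_rate=0.5):
        # Note: per the check above, this reaches to_ + (from_ - to_) * decay_rate at t=1.0,
        # not to_ itself.
        return to_ + (from_ - to_) * decay_rate ** t

    t = np.array([0.0, 0.5, 1.0])
    print(linear(t))  # -> [2.0, 1.25, 0.5]
    print(polynomial(t))  # -> [2.0, 0.875, 0.5]
    print(exponential(t))  # -> [2.0, 1.5607, 1.25]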