def test_normal(self):
    # Create 5 normal distributions (2 parameters (mean and stddev) each).
    param_space = Tuple(
        Float(shape=(5,)),  # mean
        Float(0.5, 1.0, shape=(5,)),  # stddev
        main_axes="B"
    )
    values_space = Float(shape=(5,), main_axes="B")

    # The Component to test.
    normal = Normal()

    # Batch of size=2 and deterministic (True).
    input_ = param_space.sample(2)
    expected = input_[0]  # 0 = mean
    # Sample n times, expect always mean value (deterministic draw).
    for _ in range(50):
        out = normal.sample(input_, deterministic=True)
        check(out, expected)
        out = normal.sample_deterministic(input_)
        check(out, expected)

    # Batch of size=1 and non-deterministic -> expect roughly the mean.
    input_ = param_space.sample(1)
    expected = input_[0][0]  # 0 = mean
    outs = []
    for _ in range(100):
        out = normal.sample(input_, deterministic=False)
        outs.append(out)
        out = normal.sample_stochastic(input_)
        outs.append(out)
    check(np.mean(outs), expected.mean(), decimals=1)

    means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0]])
    log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 10.0]])
    # The normal-adapter applies the following line to the NN output (which it
    # interprets as log(stddev)). It doesn't really matter in this test case, though.
    stds = np.exp(np.clip(log_stds, a_min=MIN_LOG_NN_OUTPUT, a_max=MAX_LOG_NN_OUTPUT))
    values = np.array([[1.0, 2.0, 0.4, 10.0, 5.4]])

    # Test log-likelihood outputs.
    out = normal.log_prob((means, stds), values)
    expected_outputs = np.log(norm.pdf(values, means, stds))
    check(out, expected_outputs)

    # Test entropy outputs.
    out = normal.entropy((means, stds))
    # See: https://en.wikipedia.org/wiki/Normal_distribution#Maximum_entropy
    expected_entropy = 0.5 * (1 + np.log(2 * np.square(stds) * np.pi))
    check(out, expected_entropy)
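# Side note (illustrative sketch, not part of the test suite): the closed-form
# entropy used above, 0.5 * (1 + log(2 * pi * sigma^2)), is exactly scipy's
# norm.entropy(). Assumes `np` and `norm` as imported for the tests above.
def _normal_entropy_sanity_check():
    sigma = np.array([0.5, 1.0, 2.0])
    closed_form = 0.5 * (1 + np.log(2 * np.pi * np.square(sigma)))
    assert np.allclose(closed_form, norm.entropy(loc=0.0, scale=sigma))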
def test_categorical(self):
    # Create 5 categorical distributions of 3 categories each.
    param_space = Float(shape=(5, 3), low=-1.0, high=2.0, main_axes="B")
    values_space = Int(3, shape=(5,), main_axes="B")

    # The Component to test.
    categorical = Categorical()

    # Batch of size=3 and deterministic (True).
    input_ = param_space.sample(3)
    expected = np.argmax(input_, axis=-1)
    # Sample n times, expect always max value (max likelihood for deterministic draw).
    for _ in range(10):
        out = categorical.sample(input_, deterministic=True)
        check(out, expected)
        out = categorical.sample_deterministic(input_)
        check(out, expected)

    # Batch of size=3 and non-deterministic -> expect roughly the mean.
    input_ = param_space.sample(3)
    outs = []
    for _ in range(100):
        out = categorical.sample(input_, deterministic=False)
        outs.append(out)
        out = categorical.sample_stochastic(input_)
        outs.append(out)
    check(np.mean(outs), 1.0, decimals=0)

    input_ = param_space.sample(1)
    probs = softmax(input_)
    values = values_space.sample(1)

    # Test log-likelihood outputs.
    out = categorical.log_prob(input_, values)
    check(out, np.log(np.array([[
        probs[0][0][values[0][0]],
        probs[0][1][values[0][1]],
        probs[0][2][values[0][2]],
        probs[0][3][values[0][3]],
        probs[0][4][values[0][4]]
    ]])), decimals=4)

    # Test entropy outputs.
    out = categorical.entropy(input_)
    expected_entropy = -np.sum(probs * np.log(probs), axis=-1)
    check(out, expected_entropy)
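# Side note (illustrative sketch): for logits z, the categorical log-likelihood of
# class k is log(softmax(z)[k]) = z[k] - logsumexp(z) -- the quantity the probs-based
# expectation above computes. scipy.special.logsumexp is a local import for this sketch.
def _categorical_log_prob_sanity_check():
    from scipy.special import logsumexp
    logits = np.array([0.5, -1.0, 2.0])
    k = 2
    assert np.isclose(logits[k] - logsumexp(logits), np.log(softmax(logits)[k]))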
def __init__(self, config, name=None):
    super().__init__(config, name)
    self.Phi = Preprocessor.make(config.preprocessor)
    self.x = self.Phi(Space.make(config.state_space).with_batch())  # preprocessed states (x)
    self.a = Space.make(config.action_space).with_batch()  # actions (a)
    self.Q = Network.make(
        network=config.q_network,
        input_space=self.x,
        output_space=Dict(A=self.a, V=Float().with_batch()),  # dueling network outputs
        adapters=dict(A=dict(pre_network=config.dueling_a_network),
                      V=dict(pre_network=config.dueling_v_network))
    )
    self.Qt = self.Q.copy(trainable=False)
    self.memory = PrioritizedReplayBuffer.make(
        record_space=Dict(dict(s=self.x, a=self.a, r=float, t=bool, n=int), main_axes="B"),
        capacity=config.memory_capacity,
        alpha=config.memory_alpha,
        beta=config.memory_beta,
        next_record_setup=dict(s="s_", n_step=config.n_step)
    )
    self.n_step = NStep(config.gamma, n_step=config.n_step, n_step_only=True)  # N-step component
    self.L = DDDQNLoss()  # double/dueling/n-step Q-loss
    self.optimizer = Optimizer.make(self.config.optimizer)
    self.epsilon = Decay.make(self.config.epsilon)  # for epsilon-greedy learning
    self.Phi.reset()  # make sure the preprocessor is clean
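# Illustrative sketch (an assumption about the math behind DDDQNLoss, not its actual
# implementation): the double-Q, n-step target combines the n-step return with the
# target net evaluated at the online net's argmax action:
#   y = sum_{k<n} gamma^k * r_k + gamma^n * Qt(s_n, argmax_a Q(s_n, a))
def _double_q_n_step_target_sketch():
    import numpy as np
    gamma, n = 0.99, 3
    rewards = np.array([1.0, 0.0, 0.5])   # r_0 .. r_{n-1} (made-up numbers)
    q_online = np.array([0.2, 1.3, 0.7])  # Q(s_n, a) per action (online net)
    q_target = np.array([0.3, 1.0, 0.9])  # Qt(s_n, a) per action (target net)
    n_step_return = float(np.sum(rewards * gamma ** np.arange(n)))
    a_star = int(np.argmax(q_online))     # action selected by the online net ...
    return n_step_return + gamma ** n * q_target[a_star]  # ... evaluated by the target net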
def test_bernoulli(self):
    # Create 5 bernoulli distributions (or a multiple thereof if we use batch-size > 1).
    param_space = Float(-1.0, 1.0, shape=(5,), main_axes="B")

    # The Component to test.
    bernoulli = Bernoulli()

    # Batch of size=6 and deterministic (True).
    input_ = param_space.sample(6)
    expected = sigmoid(input_) > 0.5
    # Sample n times, expect always max value (max likelihood for deterministic draw).
    for _ in range(10):
        out = bernoulli.sample(input_, deterministic=True)
        check(out, expected)
        out = bernoulli.sample_deterministic(input_)
        check(out, expected)

    # Batch of size=6 and non-deterministic -> expect roughly the mean.
    input_ = param_space.sample(6)
    outs = []
    for _ in range(100):
        out = bernoulli.sample(input_, deterministic=False)
        outs.append(out)
        out = bernoulli.sample_stochastic(input_)
        outs.append(out)
    check(np.mean(outs), 0.5, decimals=1)

    logits = np.array([[0.1, -0.2, 0.3, -4.4, 2.0]])
    probs = sigmoid(logits)

    # Test log-likelihood outputs.
    values = np.array([[True, False, False, True, True]])
    out = bernoulli.log_prob(logits, values=values)
    expected_log_probs = np.log(np.where(values, probs, 1.0 - probs))
    check(out, expected_log_probs)

    # Test entropy outputs.
    # Binary entropy with natural log.
    expected_entropy = -(probs * np.log(probs)) - ((1.0 - probs) * np.log(1.0 - probs))
    out = bernoulli.entropy(logits)
    check(out, expected_entropy)
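# Side note (illustrative sketch): the binary entropy used above matches
# scipy.stats.bernoulli(p).entropy() (natural log). The scipy import is local
# to this sketch.
def _bernoulli_entropy_sanity_check():
    from scipy.stats import bernoulli as scipy_bernoulli
    p = np.array([0.1, 0.5, 0.9])
    closed_form = -(p * np.log(p)) - (1.0 - p) * np.log(1.0 - p)
    assert np.allclose(closed_form, scipy_bernoulli(p).entropy())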
def test_gumbel_softmax_distribution(self):
    # 5-categorical Gumbel-Softmax.
    param_space = Float(shape=(5,), main_axes="B")
    values_space = Float(shape=(5,), main_axes="B")

    gumbel_softmax_distribution = GumbelSoftmax(temperature=1.0)

    # Batch of size=2 and deterministic (True).
    input_ = param_space.sample(2)
    expected = softmax(input_)
    # Sample n times; the deterministic draw always returns the softmaxed logits.
    for _ in range(50):
        out = gumbel_softmax_distribution.sample(input_, deterministic=True)
        check(out, expected)
        out = gumbel_softmax_distribution.sample_deterministic(input_)
        check(out, expected)

    # Batch of size=1 and non-deterministic -> expect roughly the vector of probs.
    input_ = param_space.sample(1)
    expected = softmax(input_)
    outs = []
    for _ in range(100):
        out = gumbel_softmax_distribution.sample(input_)
        outs.append(out)
        out = gumbel_softmax_distribution.sample_stochastic(input_)
        outs.append(out)
    check(np.mean(outs, axis=0), expected, decimals=1)

    return
    # TODO: Figure out the Gumbel-Softmax log-prob calculation (our current
    # implementation does not correspond with the paper's formula).
    def gumbel_log_density(y, probs, num_categories, temperature=1.0):
        # https://arxiv.org/pdf/1611.01144.pdf.
        density = np.math.factorial(num_categories - 1) * np.math.pow(temperature, num_categories - 1) * \
            (np.sum(probs / np.power(y, temperature), axis=-1) ** -num_categories) * \
            np.prod(probs / np.power(y, temperature + 1.0), axis=-1)
        return np.log(density)

    # Test log-likelihood outputs.
    input_ = param_space.sample(3)
    values = values_space.sample(3)
    expected = gumbel_log_density(values, softmax(input_), num_categories=param_space.shape[0])
    out = gumbel_softmax_distribution.log_prob(input_, values)
    check(out, expected)
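# Illustrative sketch of the reparameterized Gumbel-Softmax draw exercised above
# (Jang et al., https://arxiv.org/abs/1611.01144): perturb the logits with
# Gumbel(0, 1) noise and apply a temperature-scaled softmax. Uses the module's
# `softmax` helper and numpy; not the library's actual implementation.
def _gumbel_softmax_sample_sketch(logits, temperature=1.0):
    u = np.random.uniform(low=1e-10, high=1.0, size=np.shape(logits))
    gumbel_noise = -np.log(-np.log(u))  # Gumbel(0, 1) via inverse transform sampling
    return softmax((logits + gumbel_noise) / temperature)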
def test_multivariate_normal(self):
    # Create batch0=n (batch-rank), batch1=2 (can be used for m mixed Gaussians),
    # num-events=3 (trivariate) distributions (2 parameters (mean and stddev) each).
    num_events = 3  # 3=trivariate Gaussian
    num_mixed_gaussians = 2  # 2x trivariate Gaussians (mixed)
    param_space = Tuple(
        Float(shape=(num_mixed_gaussians, num_events)),  # mean
        Float(0.5, 1.0, shape=(num_mixed_gaussians, num_events)),  # diag (variance)
        main_axes="B"
    )
    values_space = Float(shape=(num_mixed_gaussians, num_events), main_axes="B")

    # The Component to test.
    distribution = MultivariateNormal()

    input_ = param_space.sample(4)
    expected = input_[0]  # 0=mean
    # Sample n times, expect always mean value (deterministic draw).
    for _ in range(50):
        out = distribution.sample(input_, deterministic=True)
        check(out, expected)
        out = distribution.sample_deterministic(input_)
        check(out, expected)

    # Batch of size=1 and non-deterministic -> expect roughly the mean.
    input_ = param_space.sample(1)
    expected = input_[0]  # 0=mean
    outs = []
    for _ in range(100):
        out = distribution.sample(input_, deterministic=False)
        outs.append(out)
        out = distribution.sample_stochastic(input_)
        outs.append(out)
    check(np.mean(outs), expected.mean(), decimals=1)

    means = values_space.sample(2)
    stds = values_space.sample(2)
    values = values_space.sample(2)

    # Test log-likelihood outputs (against scipy).
    out = distribution.log_prob((means, stds), values)
    # Sum up the individual log-probs as we have a diag (independent) covariance matrix.
    check(out, np.sum(np.log(norm.pdf(values, means, stds)), axis=-1), decimals=4)
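# Side note (illustrative sketch): with a diagonal covariance, the multivariate
# normal log-pdf factorizes into a sum of univariate log-pdfs -- which is what the
# expected value above exploits. scipy.stats.multivariate_normal is a local import
# for this sketch.
def _diag_mvn_factorization_check():
    from scipy.stats import multivariate_normal as scipy_mvn
    mean = np.array([0.1, -0.2, 0.3])
    std = np.array([0.5, 0.8, 1.0])
    x = np.array([0.0, 0.1, -0.1])
    lp_joint = scipy_mvn.logpdf(x, mean=mean, cov=np.diag(std ** 2))
    assert np.isclose(lp_joint, np.sum(np.log(norm.pdf(x, mean, std))))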
def test_beta(self):
    # Create 5 beta distributions (2 parameters (alpha and beta) each).
    param_space = Tuple(
        Float(shape=(5,)),  # alpha
        Float(shape=(5,)),  # beta
        main_axes="B"
    )
    values_space = Float(shape=(5,), main_axes="B")

    # The Component to test.
    low, high = -1.0, 2.0
    beta_distribution = Beta(low=low, high=high)

    # Batch of size=2 and deterministic (True).
    input_ = param_space.sample(2)
    # Mean for a Beta distribution: 1 / [1 + (beta/alpha)]
    expected = (1.0 / (1.0 + input_[1] / input_[0])) * (high - low) + low
    # Sample n times, expect always mean value (deterministic draw).
    for _ in range(100):
        out = beta_distribution.sample(input_, deterministic=True)
        check(out, expected)
        out = beta_distribution.sample_deterministic(input_)
        check(out, expected)

    # Batch of size=1 and non-deterministic -> expect roughly the mean.
    input_ = param_space.sample(1)
    expected = (1.0 / (1.0 + input_[1] / input_[0])) * (high - low) + low
    outs = []
    for _ in range(100):
        out = beta_distribution.sample(input_, deterministic=False)
        outs.append(out)
        out = beta_distribution.sample_stochastic(input_)
        outs.append(out)
    check(np.mean(outs), expected.mean(), decimals=1)

    alpha_ = values_space.sample(1)
    beta_ = values_space.sample(1)
    values = values_space.sample(1)
    values_scaled = values * (high - low) + low

    # Test log-likelihood outputs (against scipy).
    out = beta_distribution.log_prob((alpha_, beta_), values_scaled)
    check(out, np.log(beta.pdf(values, alpha_, beta_)), decimals=4)

    # TODO: Test entropy outputs (against scipy).
    out = beta_distribution.entropy((alpha_, beta_))
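# Sketch for the entropy TODO above (an assumption about the math, not a verified
# expectation of the implementation): scipy's beta.entropy() gives the differential
# entropy of the standard Beta on [0, 1]; rescaling the support to [low, high] adds
# log(high - low), since an affine map y = c * x + d shifts differential entropy by log|c|.
def _scaled_beta_entropy_reference(alpha_, beta_, low=-1.0, high=2.0):
    return beta.entropy(alpha_, beta_) + np.log(high - low)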
class TestMemoriesGenerically(unittest.TestCase):
    """
    Tests different generic functionalities of Memories.
    """
    record_space = Dict(
        states=dict(state1=float, state2=Float(shape=(2,))),
        actions=dict(action1=int),
        reward=float,
        terminals=bool,
        main_axes="B"
    )
    record_space_no_next_state = Dict(
        s=dict(s1=float, s2=float), a=dict(a1=Int(10)), r=float, t=Bool(), main_axes="B"
    )
    capacity = 10
    alpha = 1.0
    beta = 1.0
    max_priority = 1.0

    def test_next_state_handling(self):
        """
        Tests if next-states can be stored efficiently (not using any space!) in the memory.

        NOTE: The memory does not care about terminal signals, it will always return
        the n-next-in-memory state regardless of whether this is a useful state
        (terminal=False) or not (terminal=True). In case of a terminal=True, the next
        state (whether it be the true terminal state, the reset state, or any other
        random state) does not matter anyway.
        """
        capacity = 10
        batch_size = 2

        # Test all classes of memories.
        for class_ in [ReplayBuffer, PrioritizedReplayBuffer]:
            memory = class_(record_space=self.record_space_no_next_state,
                            capacity=capacity, next_record_setup=dict(s="s_"))

            # Insert n records (inserts must always be batch-size).
            data = dict(
                s=dict(s1=np.array([0.0, 1.0]), s2=np.array([2.0, 3.0])),
                a=np.array([0, 1]),
                r=np.array([-0.0, -1.0]),
                t=np.array([False, True]),
                s_=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1]))
            )
            memory.add_records(data)

            # Check whether inserting the wrong batch size raises an Exception.
            try:
                data = self.record_space_no_next_state.sample(batch_size + 1)
                data["s_"] = self.record_space_no_next_state["s"].sample(batch_size)
                memory.add_records(data)
                assert False, "ERROR: Should not get here. Error is expected."
            except SurrealError:
                pass

            # Assert we can now fetch n elements.
            retrieved_data = memory.get_records(num_records=1)
            self.assertEqual(1, len(retrieved_data["t"]))
            # Check the next state.
            if retrieved_data["s"]["s1"][0] == 0.0:
                self.assertTrue(retrieved_data["s_"]["s1"] == 0.1 and retrieved_data["s_"]["s2"] == 2.1)
            else:
                self.assertTrue(retrieved_data["s"]["s1"] == 1.0)
                self.assertTrue(retrieved_data["s_"]["s1"] == 1.1 and retrieved_data["s_"]["s2"] == 3.1)

            # Insert another 2xn records and then check for correct next-state returns when getting records.
            data = dict(
                s=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1])),
                a=np.array([2, 3]),
                r=np.array([-2.0, -3.0]),
                t=np.array([False, False]),
                s_=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2])),
                a=np.array([4, 5]),
                r=np.array([-4.0, -5.0]),
                t=np.array([True, True]),
                s_=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3]))
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=2)
                self.assertEqual(2, len(retrieved_data["t"]))
                # Check the next states (always 0.1 larger than state).
                for i in range(2):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)

            self.assertTrue(memory.size == 6)

            # Insert up to capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3])),
                a=np.array([6, 7]),
                r=np.array([-6.0, -7.0]),
                t=np.array([True, False]),
                s_=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4])),
                a=np.array([8, 9]),
                r=np.array([-8.0, -9.0]),
                t=np.array([False, False]),
                s_=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5]))
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=3)
                self.assertEqual(3, len(retrieved_data["t"]))
                # Check the next states (always 0.1 larger than state).
                for i in range(3):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)

            self.assertTrue(memory.size == 10)

            # Go a little bit (one batch) over capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5])),
                a=np.array([10, 11]),
                r=np.array([-10.0, -11.0]),
                t=np.array([True, True]),
                s_=dict(s1=np.array([0.6, 1.6]), s2=np.array([2.6, 3.6]))
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=4)
                self.assertEqual(4, len(retrieved_data["t"]))
                # Check the next states (always 0.1 larger than state).
                for i in range(4):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)

            self.assertTrue(memory.size == 10)

    def test_next_state_handling_with_n_step(self):
        """
        Tests if next-states can be stored efficiently (not using any space!) in the memory
        using an n-step memory.

        NOTE: The memory does not care about terminal signals, it will always return
        the n-next-in-memory state regardless of whether this is a useful state
        (terminal=False) or not (terminal=True). In case of a terminal=True, the next
        state (whether it be the true terminal state, the reset state, or any other
        random state) does not matter anyway.
        """
        capacity = 10
        batch_size = 2

        # Test all classes of memories.
        for class_ in [ReplayBuffer, PrioritizedReplayBuffer]:
            memory = class_(record_space=self.record_space_no_next_state,
                            capacity=capacity, next_record_setup=dict(s="s_", n_step=3))

            # Insert n records (inserts must always be batch-size).
            data = dict(
                s=dict(s1=np.array([0.0, 1.0]), s2=np.array([2.0, 3.0])),
                a=np.array([0, 1]),
                r=np.array([-0.0, -1.0]),
                t=np.array([False, True]),
                s_=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3]))  # s' is now the n-step s'
            )
            memory.add_records(data)

            # Check whether inserting the wrong batch size raises an Exception.
            try:
                data = self.record_space_no_next_state.sample(batch_size + 1)
                data["s_"] = self.record_space_no_next_state["s"].sample(batch_size)
                memory.add_records(data)
                assert False, "ERROR: Should not get here. Error is expected."
            except SurrealError:
                pass

            # Assert we cannot pull samples yet. n-step is 3, so we need at least
            # 3 elements in memory.
            try:
                memory.get_records(num_records=1)
                assert False, "ERROR: Should not get here. Error is expected."
            except SurrealError:
                pass

            # Insert another 2xn records and then check for correct next-state returns when getting records.
            data = dict(
                s=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1])),
                a=np.array([2, 3]),
                r=np.array([-2.0, -3.0]),
                t=np.array([False, False]),
                s_=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4]))  # s' is now the n-step s'
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2])),
                a=np.array([4, 5]),
                r=np.array([-4.0, -5.0]),
                t=np.array([True, True]),
                s_=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5]))  # s' is now the n-step s'
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=2)
                self.assertEqual(2, len(retrieved_data["t"]))
                # Check the next states (always 0.3 larger than state, due to n-step=3).
                for i in range(2):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.3)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.3)

            self.assertTrue(memory.size == 6)

            # Insert up to capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3])),
                a=np.array([6, 7]),
                r=np.array([-6.0, -7.0]),
                t=np.array([True, False]),
                s_=dict(s1=np.array([0.6, 1.6]), s2=np.array([2.6, 3.6]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4])),
                a=np.array([8, 9]),
                r=np.array([-8.0, -9.0]),
                t=np.array([False, False]),
                s_=dict(s1=np.array([0.7, 1.7]), s2=np.array([2.7, 3.7]))
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=3)
                self.assertEqual(3, len(retrieved_data["t"]))
                # Check the next states (always 0.3 larger than state, due to n-step=3).
                for i in range(3):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.3)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.3)

            self.assertTrue(memory.size == 10)

            # Go a little bit (two batches) over capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5])),
                a=np.array([10, 11]),
                r=np.array([-10.0, -11.0]),
                t=np.array([True, True]),
                s_=dict(s1=np.array([0.8, 1.8]), s2=np.array([2.8, 3.8]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.6, 1.6]), s2=np.array([2.6, 3.6])),
                a=np.array([10, 11]),
                r=np.array([-10.0, -11.0]),
                t=np.array([False, False]),
                s_=dict(s1=np.array([0.9, 1.9]), s2=np.array([2.9, 3.9]))
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=4)
                self.assertEqual(4, len(retrieved_data["t"]))
                # Check the next states (always 0.3 larger than state, due to n-step=3).
                for i in range(4):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.3)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.3)

            self.assertTrue(memory.size == 10)
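# Side note (illustrative sketch): besides storing s_ as the state n steps ahead
# (what the tests above exercise), n-step setups also accumulate the discounted
# reward over the skipped steps. A minimal NumPy version of that accumulation:
def _n_step_return_sketch(rewards, gamma=0.99):
    import numpy as np
    # rewards: [r_t, ..., r_{t+n-1}] -> sum_k gamma^k * r_{t+k}
    rewards = np.asarray(rewards)
    return float(np.sum(rewards * gamma ** np.arange(len(rewards))))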
class TestPrioritizedReplayBuffer(unittest.TestCase):
    """
    Tests insertion and (weighted) sampling of the PrioritizedReplayBuffer Component.
    """
    record_space = Dict(
        states=dict(state1=float, state2=Float(shape=(2,))),
        actions=dict(action1=int),
        reward=float,
        terminals=bool,
        main_axes="B"
    )
    capacity = 10
    alpha = 1.0
    beta = 1.0
    max_priority = 1.0

    def test_insert(self):
        memory = PrioritizedReplayBuffer(
            record_space=self.record_space,
            capacity=self.capacity,
            alpha=self.alpha,
            beta=self.beta
        )
        # Assert indices are 0 before insert.
        self.assertEqual(memory.size, 0)
        self.assertEqual(memory.index, 0)

        # Insert single record (no batch rank).
        data = self.record_space.sample()
        memory.add_records(data)
        self.assertTrue(memory.size == 1)
        self.assertTrue(memory.index == 1)

        # Insert single record (w/ batch rank).
        data = self.record_space.sample(1)
        memory.add_records(data)
        self.assertTrue(memory.size == 2)
        self.assertTrue(memory.index == 2)

        # Insert batched records.
        data = self.record_space.sample(5)
        memory.add_records(data)
        self.assertTrue(memory.size == 7)
        self.assertTrue(memory.index == 7)

        # Insert over capacity.
        data = self.record_space.sample(100)
        memory.add_records(data)
        self.assertTrue(memory.size == 10)
        self.assertTrue(memory.index == 7)

    def test_update_records(self):
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=self.capacity)

        # Insert record samples.
        num_records = 2
        data = self.record_space.sample(num_records)
        memory.add_records(data)
        self.assertTrue(memory.size == num_records)
        self.assertTrue(memory.index == num_records)

        # Fetch records, their indices and weights.
        batch, indices, weights = memory.get_records_with_indices_and_weights(num_records)
        check(weights, np.ones(shape=(num_records,)))
        self.assertEqual(num_records, len(indices))
        self.assertTrue(memory.size == num_records)
        self.assertTrue(memory.index == num_records)

        # Update weight of index 0 to a very small value.
        memory.update_records(np.array([0]), np.array([0.01]))
        # Expect to sample almost only index 1 (which still has a weight of 1.0).
        for _ in range(100):
            _, indices, weights = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 970)

        # Update weights of indices 0 and 1 to the same (random) value.
        # Expect to sample both roughly equally.
        for _ in range(100):
            rand = np.random.random()
            memory.update_records(np.array([0, 1]), np.array([rand, rand]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 400)
            self.assertLessEqual(np.sum(indices), 600)

        # Update weights to a 1:2 ratio.
        # Expect to sample index 1 about twice as often as index 0.
        for _ in range(100):
            rand = np.random.random() * 10
            memory.update_records(np.array([0, 1]), np.array([rand, rand * 2]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 600)
            self.assertLessEqual(np.sum(indices), 750)

        # Update weights to a 1:4 ratio.
        # Expect to sample index 1 about four times as often as index 0.
        for _ in range(100):
            rand = np.random.random() * 10
            memory.update_records(np.array([0, 1]), np.array([rand, rand * 4]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 750)
            self.assertLessEqual(np.sum(indices), 850)

        # Update weights to a 1:9 ratio.
        # Expect to sample index 1 about nine times as often as index 0.
        for _ in range(100):
            rand = np.random.random() * 10
            memory.update_records(np.array([0, 1]), np.array([rand, rand * 9]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 850)
            self.assertLessEqual(np.sum(indices), 950)

        # Insert more record samples.
        num_records = 10
        data = self.record_space.sample(num_records)
        memory.add_records(data)
        self.assertTrue(memory.size == self.capacity)
        self.assertTrue(memory.index == 2)

        # Update all 10 weights to strictly increasing values and sample batches of size < 10.
        memory.update_records(
            np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
            np.array([0.1, 1., 3., 8., 16., 32., 64., 128., 256., 512.]))
        counts = Counter()
        for _ in range(1000):
            _, indices, _ = memory.get_records_with_indices_and_weights(
                num_records=np.random.randint(1, 6))
            for i in indices:
                counts[i] += 1
        print(counts)
        self.assertTrue(
            counts[9] >= counts[8] >= counts[7] >= counts[6] >= counts[5] >=
            counts[4] >= counts[3] >= counts[2] >= counts[1] >= counts[0]
        )

    def test_segment_tree_insert_values(self):
        """
        Tests if the segment tree inserts into correct positions.
        """
        memory = PrioritizedReplayBuffer(
            record_space=self.record_space,
            capacity=self.capacity,
            alpha=self.alpha,
            beta=self.beta
        )
        priority_capacity = 1
        while priority_capacity < self.capacity:
            priority_capacity *= 2

        sum_segment_values = memory.merged_segment_tree.sum_segment_tree.values
        min_segment_values = memory.merged_segment_tree.min_segment_tree.values
        self.assertEqual(sum(sum_segment_values), 0)
        self.assertEqual(sum(min_segment_values), float("inf"))
        self.assertEqual(len(sum_segment_values), 2 * priority_capacity)
        self.assertEqual(len(min_segment_values), 2 * priority_capacity)

        # Insert 1 element.
        observation = self.record_space.sample(size=1)
        memory.add_records(observation)
        # Check insert positions: the initial insert is at `priority_capacity`.
        print(sum_segment_values)
        print(min_segment_values)
        start = priority_capacity
        while start >= 1:
            self.assertEqual(sum_segment_values[start], 1.0)
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

        # Insert another element.
        observation = self.record_space.sample(size=1)
        memory.add_records(observation)
        # Index is shifted by 1.
        start = priority_capacity + 1
        self.assertEqual(sum_segment_values[start], 1.0)
        self.assertEqual(min_segment_values[start], 1.0)
        start = int(start / 2)
        while start >= 1:
            # 1 + 1 is 2 on the segment.
            self.assertEqual(sum_segment_values[start], 2.0)
            # min is still 1.
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

    def test_tree_insert(self):
        """
        Tests inserting into the segment tree and querying segments.
        """
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        self.assertTrue(np.isclose(tree.get_sum(), 4.0))
        self.assertTrue(np.isclose(tree.get_sum(0, 2), 0.0))
        self.assertTrue(np.isclose(tree.get_sum(0, 3), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, 3), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, -1), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, 4), 4.0))

    def test_prefixsum_idx(self):
        """
        Tests fetching the index corresponding to a prefix sum.
""" memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4) tree = memory.merged_segment_tree.sum_segment_tree tree.insert(2, 1.0) tree.insert(3, 3.0) self.assertEqual(tree.index_of_prefixsum(0.0), 2) self.assertEqual(tree.index_of_prefixsum(0.5), 2) self.assertEqual(tree.index_of_prefixsum(0.99), 2) self.assertEqual(tree.index_of_prefixsum(1.01), 3) self.assertEqual(tree.index_of_prefixsum(3.0), 3) self.assertEqual(tree.index_of_prefixsum(4.0), 3) memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4) tree = memory.merged_segment_tree.sum_segment_tree tree.insert(0, 0.5) tree.insert(1, 1.0) tree.insert(2, 1.0) tree.insert(3, 3.0) self.assertEqual(tree.index_of_prefixsum(0.0), 0) self.assertEqual(tree.index_of_prefixsum(0.55), 1) self.assertEqual(tree.index_of_prefixsum(0.99), 1) self.assertEqual(tree.index_of_prefixsum(1.51), 2) self.assertEqual(tree.index_of_prefixsum(3.0), 3) self.assertEqual(tree.index_of_prefixsum(5.50), 3)
def test_joint_cumulative_distribution(self):
    param_space = Dict({
        "a": Float(shape=(4,)),  # 4-discrete
        "b": Dict({
            "ba": Tuple([Float(shape=(3,)), Float(0.1, 1.0, shape=(3,))]),  # 3-variate normal
            "bb": Tuple([Float(shape=(2,)), Float(shape=(2,))]),  # beta -1 to 1
            "bc": Tuple([Float(shape=(4,)), Float(0.1, 1.0, shape=(4,))]),  # normal (dim=4)
        })
    }, main_axes="B")
    values_space = Dict({
        "a": Int(4),
        "b": Dict({
            "ba": Float(shape=(3,)),
            "bb": Float(shape=(2,)),
            "bc": Float(shape=(4,))
        })
    }, main_axes="B")
    low, high = -1.0, 1.0
    cumulative_distribution = JointCumulativeDistribution(distributions={
        "a": Categorical(),
        "b": {
            "ba": MultivariateNormal(),
            "bb": Beta(low=low, high=high),
            "bc": Normal()
        }
    })

    # Batch of size=2 and deterministic (True).
    input_ = param_space.sample(2)
    input_["a"] = softmax(input_["a"])
    expected_mean = {
        "a": np.argmax(input_["a"], axis=-1),
        "b": {
            "ba": input_["b"]["ba"][0],  # [0]=Mean
            # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] * range + low
            "bb": (1.0 / (1.0 + input_["b"]["bb"][1] / input_["b"]["bb"][0])) * (high - low) + low,
            "bc": input_["b"]["bc"][0],
        }
    }
    # Sample n times, expect always mean value (deterministic draw).
    for _ in range(20):
        out = cumulative_distribution.sample(input_, deterministic=True)
        check(out, expected_mean)
        out = cumulative_distribution.sample_deterministic(input_)
        check(out, expected_mean)

    # Batch of size=1 and non-deterministic -> expect roughly the mean.
    input_ = param_space.sample(1)
    input_["a"] = softmax(input_["a"])
    expected_mean = {
        "a": np.sum(input_["a"] * np.array([0, 1, 2, 3])),
        "b": {
            "ba": input_["b"]["ba"][0],  # [0]=Mean
            # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] * range + low
            "bb": (1.0 / (1.0 + input_["b"]["bb"][1] / input_["b"]["bb"][0])) * (high - low) + low,
            "bc": input_["b"]["bc"][0],
        }
    }
    outs = []
    for _ in range(500):
        out = cumulative_distribution.sample(input_)
        outs.append(out)
        out = cumulative_distribution.sample_stochastic(input_)
        outs.append(out)
    check(np.mean(np.stack([o["a"][0] for o in outs], axis=0), axis=0), expected_mean["a"], atol=0.3)
    check(np.mean(np.stack([o["b"]["ba"][0] for o in outs], axis=0), axis=0), expected_mean["b"]["ba"][0], decimals=1)
    check(np.mean(np.stack([o["b"]["bb"][0] for o in outs], axis=0), axis=0), expected_mean["b"]["bb"][0], decimals=1)
    check(np.mean(np.stack([o["b"]["bc"][0] for o in outs], axis=0), axis=0), expected_mean["b"]["bc"][0], decimals=1)

    # Test log-likelihood outputs.
    params = param_space.sample(1)
    params["a"] = softmax(params["a"])
    # Make sure beta-values are within 0.0 and 1.0 for the numpy calculation
    # (which doesn't have scaling).
    values = values_space.sample(1)
    log_prob_beta = np.log(beta.pdf(values["b"]["bb"], params["b"]["bb"][0], params["b"]["bb"][1]))
    # Now do the scaling for b/bb (beta values).
    values["b"]["bb"] = values["b"]["bb"] * (high - low) + low
    expected_log_llh = np.log(params["a"][0][values["a"][0]]) + \
        np.sum(np.log(norm.pdf(values["b"]["ba"][0], params["b"]["ba"][0], params["b"]["ba"][1]))) + \
        np.sum(log_prob_beta) + \
        np.sum(np.log(norm.pdf(values["b"]["bc"][0], params["b"]["bc"][0], params["b"]["bc"][1])))
    out = cumulative_distribution.log_prob(params, values)
    check(out, expected_log_llh, decimals=0)
def test_squashed_normal(self):
    param_space = Tuple(
        Float(-1.0, 1.0, shape=(5,)),
        Float(0.5, 1.0, shape=(5,)),
        main_axes="B"
    )

    low, high = -2.0, 1.0
    squashed_distribution = SquashedNormal(low=low, high=high)

    # Batch of size=2 and deterministic (True).
    input_ = param_space.sample(2)
    expected = ((np.tanh(input_[0]) + 1.0) / 2.0) * (high - low) + low  # [0] = mean
    # Sample n times, expect always mean value (deterministic draw).
    for _ in range(50):
        out = squashed_distribution.sample(input_, deterministic=True)
        check(out, expected)
        out = squashed_distribution.sample_deterministic(input_)
        check(out, expected)

    # Batch of size=1 and non-deterministic -> expect roughly the mean.
    input_ = param_space.sample(1)
    expected = ((np.tanh(input_[0]) + 1.0) / 2.0) * (high - low) + low  # [0] = mean
    outs = []
    for _ in range(500):
        out = squashed_distribution.sample(input_, deterministic=False)
        outs.append(out)
        self.assertTrue(np.max(out) <= high)
        self.assertTrue(np.min(out) >= low)
        out = squashed_distribution.sample_stochastic(input_)
        outs.append(out)
        self.assertTrue(np.max(out) <= high)
        self.assertTrue(np.min(out) >= low)
    check(np.mean(outs), expected.mean(), decimals=1)

    means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0],
                      [-0.1, -0.2, -0.3, -0.4, -1.0]])
    log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 10.0],
                         [0.7, -0.3, 0.4, -0.9, 8.0]])
    # The normal-adapter applies the following line to the NN output (which it
    # interprets as log(stddev)). It doesn't really matter in this test case, though.
    stds = np.exp(np.clip(log_stds, a_min=MIN_LOG_NN_OUTPUT, a_max=MAX_LOG_NN_OUTPUT))
    # Make sure values are within low and high.
    values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05],
                       [-0.9, -0.2, 0.4, -0.1, -1.05]])

    # Test log-likelihood outputs.
    # The squashing applied to a normal sample x is y = (tanh(x) + 1) / 2 * (high - low) + low.
    # By the change-of-variables formula, log p_Y(y) = log p_X(x) - log|dy/dx|, where
    # dy/dx is proportional to (1 - tanh(x)^2); the constant scale factor
    # (high - low) / 2 is not corrected for here (matching the implementation under test).
    # Unsquash values, then get log-llh from the regular gaussian.
    unsquashed_values = np.arctanh((values - low) / (high - low) * 2.0 - 1.0)
    log_prob_unsquashed = np.log(norm.pdf(unsquashed_values, means, stds))
    log_prob = log_prob_unsquashed - np.sum(
        np.log(1 - np.tanh(unsquashed_values) ** 2), axis=-1, keepdims=True)
    out = squashed_distribution.log_prob((means, stds), values)
    check(out, log_prob)

    # Test entropy outputs.
    # TODO
    return
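# Illustrative numeric check of the change-of-variables rule used above (a sketch,
# not part of the test suite): for y = tanh(x) with x ~ N(mu, sigma),
# p_Y(y) = p_X(arctanh(y)) / (1 - y^2). We verify this against a finite-difference
# derivative of the CDF. Assumes `np` and scipy's `norm` as imported for the tests above.
def _tanh_change_of_variables_check(mu=0.3, sigma=0.7, y=0.5, eps=1e-6):
    x = np.arctanh(y)
    analytic = norm.pdf(x, mu, sigma) / (1.0 - y ** 2)
    numeric = (norm.cdf(np.arctanh(y + eps), mu, sigma) -
               norm.cdf(np.arctanh(y - eps), mu, sigma)) / (2 * eps)
    assert np.isclose(analytic, numeric, rtol=1e-4)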
def test_mixture(self):
    # Create a mixture distribution consisting of 3 bivariate normals weighted by an
    # internal categorical distribution.
    num_distributions = 3
    num_events_per_multivariate = 2  # 2=bivariate
    param_space = Dict({
        "categorical": Float(shape=(num_distributions,), low=-1.5, high=2.3),
        "parameters0": Tuple(
            Float(shape=(num_events_per_multivariate,)),  # mean
            Float(shape=(num_events_per_multivariate,), low=0.5, high=1.0),  # diag
        ),
        "parameters1": Tuple(
            Float(shape=(num_events_per_multivariate,)),  # mean
            Float(shape=(num_events_per_multivariate,), low=0.5, high=1.0),  # diag
        ),
        "parameters2": Tuple(
            Float(shape=(num_events_per_multivariate,)),  # mean
            Float(shape=(num_events_per_multivariate,), low=0.5, high=1.0),  # diag
        ),
    }, main_axes="B")
    values_space = Float(shape=(num_events_per_multivariate,), main_axes="B")

    # The Component to test.
    mixture = MixtureDistribution(
        # Try different spec types.
        MultivariateNormal(), "multi-variate-normal", "multivariate_normal")

    # Batch of size=n and deterministic (True).
    input_ = param_space.sample(1)
    # Make probs for categorical.
    categorical_probs = softmax(input_["categorical"])

    # Note: Usually, the deterministic draw should return the max-likelihood value,
    # i.e. the mean of the component argmax'd by the categorical:
    # argmax = np.argmax(input_[0]["categorical"], axis=-1)
    # expected = np.array([input_[0]["parameters{}".format(idx)][0][i] for i, idx in enumerate(argmax)])
    #     input_[0]["categorical"][:, 1:2] * input_[0]["parameters1"][0] + \
    #     input_[0]["categorical"][:, 2:3] * input_[0]["parameters2"][0]
    # Here, however, we expect the categorical-weighted sum of component means.
    # The mean value is a 2D vector (bivariate distribution).
    expected = categorical_probs[:, 0:1] * input_["parameters0"][0] + \
        categorical_probs[:, 1:2] * input_["parameters1"][0] + \
        categorical_probs[:, 2:3] * input_["parameters2"][0]
    for _ in range(20):
        out = mixture.sample(input_, deterministic=True)
        check(out, expected)
        out = mixture.sample_deterministic(input_)
        check(out, expected)

    # Batch of size=1 and non-deterministic -> expect roughly the mean.
    input_ = param_space.sample(1)
    # Make probs for categorical.
    categorical_probs = softmax(input_["categorical"])
    expected = categorical_probs[:, 0:1] * input_["parameters0"][0] + \
        categorical_probs[:, 1:2] * input_["parameters1"][0] + \
        categorical_probs[:, 2:3] * input_["parameters2"][0]
    outs = []
    for _ in range(500):
        out = mixture.sample(input_, deterministic=False)
        outs.append(out)
        out = mixture.sample_stochastic(input_)
        outs.append(out)
    check(np.mean(np.array(outs), axis=0), expected, decimals=1)

    return
    # TODO: prob/log-prob tests for Mixture.
    # Test log-likelihood outputs (against scipy).
    for i in range(20):
        params = param_space.sample(1)
        # Make sure categorical params are softmaxed.
        category_probs = softmax(params["categorical"][0])
        values = values_space.sample(1)
        expected = 0.0
        v = []
        for j in range(3):
            v.append(multivariate_normal.pdf(
                values[0],
                mean=params["parameters{}".format(j)][0][0],
                cov=params["parameters{}".format(j)][1][0]))
            expected += category_probs[j] * v[-1]
        out = mixture.prob(params, values)
        check(out[0], expected, atol=0.1)

        expected = np.zeros(shape=(3,))
        for j in range(3):
            expected[j] = np.log(category_probs[j]) + np.log(multivariate_normal.pdf(
                values[0],
                mean=params["parameters{}".format(j)][0][0],
                cov=params["parameters{}".format(j)][1][0]))
        expected = np.log(np.sum(np.exp(expected)))
        out = mixture.log_prob(params, values)
        print("{}: out={} expected={}".format(i, out, expected))
        check(out, np.array([expected]), atol=0.25)
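# Side note (illustrative sketch of the log-prob math in the disabled code above):
# a mixture log-density is a logsumexp over per-component terms,
#   log p(x) = logsumexp_j( log w_j + log N(x; mu_j, Sigma_j) ),
# which is numerically stabler than log(sum(exp(...))). scipy.special.logsumexp is
# a local import for this sketch.
def _mixture_log_prob_sketch(values, weights, means, covs):
    from scipy.special import logsumexp
    comps = [np.log(w) + multivariate_normal.logpdf(values, mean=m, cov=c)
             for w, m, c in zip(weights, means, covs)]
    return logsumexp(np.stack(comps), axis=0)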
class DADS(RLAlgo):
    """
    The DADS algorithm.
    [1] Dynamics-Aware Unsupervised Discovery of Skills - A. Sharma*, S. Gu, S. Levine,
        V. Kumar, K. Hausman - Google Brain 2019
    Compare to the "Algorithm 1" and "Algorithm 2" pseudocodes in the paper.
    """
    def __init__(self, config, name=None):
        super().__init__(config, name)

        self.inference = False  # True=planning mode; False="supervised+intrinsic-reward+model-learning" mode.
        self.he = 0  # Current step within He (total episode horizon).
        self.hz = 0  # Current step within Hz (repeat horizon for one selected skill).

        self.preprocessor = Preprocessor.make(config.preprocessor)
        self.s = self.preprocessor(config.state_space.with_batch())  # preprocessed states
        self.a = config.action_space.with_batch()  # actions (a)
        self.ri = Float(main_axes=[("Episode Horizon", config.episode_horizon)])  # intrinsic rewards in He
        self.z = Float(-1.0, 1.0, shape=(config.dim_skill_vectors,), main_axes="B") if \
            config.discrete_skills is False else Int(config.dim_skill_vectors, main_axes="B")
        self.s_and_z = Dict(dict(s=self.s, z=self.z), main_axes="B")
        self.pi = Network.make(input_space=self.s_and_z, output_space=self.a, **config.policy_network)
        self.q = Network.make(
            input_space=self.s_and_z, output_space=self.s,
            distributions=dict(type="mixture", num_experts=config.num_q_experts),
            **config.q_network)
        self.B = FIFOBuffer(
            Dict(dict(s=self.s, z=self.z, a=self.a, t=bool)),
            config.episode_buffer_capacity,
            when_full=self.event_buffer_full,
            next_record_setup=dict(s="s_"))
        self.SAC = SAC(config=config.sac_config, name="SAC-level0")  # Low-level SAC.
        self.q_optimizer = Optimizer.make(config.supervised_optimizer)  # supervised model optimizer
        self.Lsup = NegLogLikelihoodLoss(distribution=MixtureDistribution(num_experts=config.num_q_experts))
        self.preprocessor.reset()

    def update(self, samples, time_percentage):
        parameters = self.q(dict(s=samples["s"], z=samples["z"]), parameters_only=True)
        # Update for K1 (num_steps_per_supervised_update) iterations on the same batch.
        weights = self.q.get_weights(as_ref=True)
        s_ = samples["s_"] if self.config.q_predicts_states_diff is False else \
            tf.nest.map_structure(lambda s, s_: s_ - s, samples["s"], samples["s_"])
        for _ in range(self.config.num_steps_per_supervised_update):
            loss = self.Lsup(parameters, s_)
            self.q_optimizer.apply_gradients(loss, weights, time_percentage=time_percentage)

        # Calculate intrinsic rewards.
        # Pull a batch of zs of size batch * (L - 1) (b/c one batch is the `z` of the
        # sample itself: the numerator's z).
        batch_size = len(samples["s"])
        zs = tf.concat([
            samples["z"],
            self.z.sample(batch_size * (self.config.num_denominator_samples_for_ri - 1))
        ], axis=0)
        s = tf.nest.map_structure(
            lambda s: tf.tile(s, [self.config.num_denominator_samples_for_ri] + ([1] * (len(s.shape) - 1))),
            samples["s"])
        s_ = tf.nest.map_structure(
            lambda s: tf.tile(s, [self.config.num_denominator_samples_for_ri] + ([1] * (len(s.shape) - 1))),
            samples["s_"])
        # Single (efficient) forward pass yielding s' likelihoods.
        all_s__llhs = tf.stack(tf.split(
            self.q(dict(s=s, z=zs), s_, likelihood=True),
            self.config.num_denominator_samples_for_ri))
        r = tf.math.log(all_s__llhs[0] / tf.reduce_sum(all_s__llhs, axis=0)) + \
            tf.math.log(float(self.config.num_denominator_samples_for_ri))

        # Update the RL algo's policy (same as pi) from our batch (using intrinsic rewards).
        self.SAC.update(
            dict(s=samples["s"], z=samples["z"], a=samples["a"], r=r,
                 s_=samples["s_"], t=samples["t"]),
            time_percentage)

    # When the buffer is full -> Update the transition model q.
    def event_buffer_full(self, event):
        self.update(self.B.flush(),
                    time_percentage=event.actor_time_steps /
                    (self.config.max_time_steps or event.env.max_time_steps))

    def event_episode_starts(self, event):
        # Initialize z if this hasn't happened yet.
        if self.z.value is None:
            self.z.assign(self.z.zeros(len(event.actor_slots)))
        # Sample new z at the trajectory's batch position.
        if self.inference is False:
            # Sample a new skill from Space z and store it in z (assume uniform).
            self.z.value[event.current_actor_slot] = self.z.sample()
        # Reset preprocessor at actor's batch position.
        self.preprocessor.reset(batch_position=event.current_actor_slot)

    # Fill the buffer with M samples.
    def event_tick(self, event):
        # Preprocess state.
        s_ = self.preprocessor(event.s_)

        ## If we are in inference mode -> do a planning step (rather than just act).
        #if self.inference:
        #    self.he += 1
        #    if self.he >= self.config.He:  # We have reached the end of the total episode horizon -> reset.
        #        env.reset()  # Send reset request to env.
        #        return
        #    self.plan(env.s)
        #    # Execute selected skill for Hz steps.
        #    if self.hz == self.config.Hz - 1:
        #        zi = self.N.sample()  # ?? ~ N[he/Hz]
        #        hz = 0  # reset counter
        #    hz += 1
        #else:
        for i in event.actor_slots:
            if self.hz[i] >= self.config.skill_horizon:
                self.z.value[i] = self.z.sample()

        # Add single(!) szas't-tuple to buffer.
        if event.actor_time_steps > 0:
            self.B.add_records(dict(s=self.s.value, z=self.z.value, a=self.a.value,
                                    t=event.t, s_=event.s_))

        # Query policy for an action.
        a_ = self.pi(dict(s=event.s_, z=self.z.value))

        # Send the new action back to the env.
        event.env.act(a_)

        # Store action and state for next tick.
        self.s.assign(s_)
        self.a.assign(a_)
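# Illustrative NumPy sketch (outside the class, not the library's implementation) of
# the intrinsic reward computed in DADS.update() above:
#   r = log( q(s'|s,z) / sum_i q(s'|s,z_i) ) + log(L),
# where z_1..z_{L-1} are freshly sampled skills and z_0 = z (the numerator's skill).
def _dads_intrinsic_reward_sketch(llh_numerator, llhs_all_skills):
    import numpy as np
    # llh_numerator: q(s'|s,z); llhs_all_skills: [q(s'|s,z_0), ..., q(s'|s,z_{L-1})],
    # with the numerator's likelihood included (as in the tf.concat above).
    num_samples = len(llhs_all_skills)
    return np.log(llh_numerator / np.sum(llhs_all_skills)) + np.log(num_samples)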
def __init__(
        self, *,
        policy_network, q_network,
        state_space, action_space,
        sac_config,
        num_q_experts=4,  # 4 used in paper.
        q_predicts_states_diff=False,
        num_denominator_samples_for_ri=250,  # 50-500 used in paper.
        dim_skill_vectors=10,
        discrete_skills=False,
        episode_horizon=200,
        skill_horizon=None,
        preprocessor=None,
        supervised_optimizer=None,
        num_steps_per_supervised_update=1,
        episode_buffer_capacity=200,
        summaries=None
):
    """
    Args:
        policy_network (Network): The policy-network (pi) to use as a function approximator
            for the learnt policy.
        q_network (Network): The dynamics-network (q) to use as a function approximator for the
            learnt env dynamics. NOTE: Not to be confused with a Q-learning Q-net! In the paper,
            the dynamics function is called `q`, hence the same nomenclature here.
        state_space (Space): The state/observation Space.
        action_space (Space): The action Space.
        sac_config (SACConfig): The config for the internal SAC-Algo used to learn the skills
            using intrinsic rewards.
        num_q_experts (int): The number of experts used in the Mixture distribution output
            by the q-network to predict the next state (s') given s (state) and z (skill vector).
        q_predicts_states_diff (bool): Whether the q-network predicts the difference between
            s and s' rather than s' directly. Default: False.
        num_denominator_samples_for_ri (int): The number of samples to calculate for the
            denominator of the intrinsic reward function (`L` in the paper).
        dim_skill_vectors (int): The number of dimensions of the learnt skill vectors.
        discrete_skills (bool): Whether skill vectors are discrete (one-hot).
        episode_horizon (int): The episode horizon (He) to move within, when gathering
            episode samples.
        skill_horizon (Optional[int]): The horizon for which to use one skill vector (before
            sampling a new one). Default: Use value of `episode_horizon`.
        preprocessor (Preprocessor): The preprocessor (if any) to use.
        supervised_optimizer (Optimizer): The optimizer to use for the supervised (q) model
            learning task.
        num_steps_per_supervised_update (int): The number of gradient descent iterations per
            update (each iteration uses the same environment samples).
        episode_buffer_capacity (int): The capacity of the episode (experience) FIFOBuffer.
        summaries (List[any]): A list of summaries to produce if `UseTfSummaries` in
            debug.json is true. In the simplest case, this is a list of `self.[...]`-property
            names of the SAC object that should be tracked after each tick.
    """
    # Clean up network configs to be passable as **kwargs to `make`.
    # Networks are given as a sequential config or directly as Keras objects -> prepend
    # a "network" key to the spec.
    if isinstance(policy_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
        policy_network = dict(network=policy_network)
    if isinstance(q_network, (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
        q_network = dict(network=q_network)

    # Make state/action space.
    state_space = Space.make(state_space)
    action_space = Space.make(action_space)

    # Fix the SAC config: add the correct state- and action-spaces.
    sac_config = SACConfig.make(
        sac_config,
        state_space=Dict(s=state_space, z=Float(-1.0, 1.0, shape=(dim_skill_vectors,))),
        action_space=action_space,
        # Use no memory. Updates are done from DADS' own buffer.
        memory_capacity=1, memory_batch_size=1,
        # Share the policy network between DADS and the underlying learning SAC.
        policy_network=policy_network
    )

    if skill_horizon is None:
        skill_horizon = episode_horizon

    super().__init__(locals())  # Config will store all c'tor variables automatically.
    # Keep track of the time step at which the last update happened
    # (only relevant for by-time-step update frequencies).
    self.last_update = 0