Exemplo n.º 1
0
    def __init__(self, config, name=None):
        """
        Builds all sub-components from `config`: preprocessor, spaces, policy- and q-networks,
        episode buffer, low-level SAC, supervised optimizer and supervised loss.

        Args:
            config: Config object; the fields read here are visible in the body below.
            name: Optional name, handed through to the parent constructor.
        """
        super().__init__(config, name)

        # True=planning mode. False="supervised+intrinsic-reward+model-learning" mode.
        self.inference = False
        # Current step within He (total episode horizon).
        self.he = 0
        # Current step within Hz (repeat horizon for one selected skill).
        self.hz = 0

        self.preprocessor = Preprocessor.make(config.preprocessor)

        # Preprocessed states (s).
        batched_state_space = config.state_space.with_batch()
        self.s = self.preprocessor(batched_state_space)
        # Actions (a).
        self.a = config.action_space.with_batch()
        # Intrinsic rewards in He.
        self.ri = Float(main_axes=[("Episode Horizon", config.episode_horizon)])

        # Skill vectors (z): continuous in [-1.0, 1.0] only if `discrete_skills` is exactly
        # False, an Int space otherwise.
        if config.discrete_skills is False:
            self.z = Float(-1.0, 1.0, shape=(config.dim_skill_vectors,), main_axes="B")
        else:
            self.z = Int(config.dim_skill_vectors, main_axes="B")

        # Combined (s, z) input space shared by both networks.
        self.s_and_z = Dict({"s": self.s, "z": self.z}, main_axes="B")

        # Policy network: (s, z) -> a.
        self.pi = Network.make(
            input_space=self.s_and_z,
            output_space=self.a,
            **config.policy_network
        )
        # q network: (s, z) -> s, with a mixture distribution over `num_q_experts` experts.
        self.q = Network.make(
            input_space=self.s_and_z,
            output_space=self.s,
            distributions={"type": "mixture", "num_experts": config.num_q_experts},
            **config.q_network
        )

        # Episode buffer; `event_buffer_full` is registered as the `when_full` callback.
        buffer_record_space = Dict({"s": self.s, "z": self.z, "a": self.a, "t": bool})
        self.B = FIFOBuffer(
            buffer_record_space,
            config.episode_buffer_capacity,
            when_full=self.event_buffer_full,
            next_record_setup={"s": "s_"}
        )

        # Low-level SAC.
        self.SAC = SAC(config=config.sac_config, name="SAC-level0")
        # Supervised model optimizer.
        self.q_optimizer = Optimizer.make(config.supervised_optimizer)
        # Supervised loss: negative log-likelihood under the same mixture distribution.
        mixture = MixtureDistribution(num_experts=config.num_q_experts)
        self.Lsup = NegLogLikelihoodLoss(distribution=mixture)

        self.preprocessor.reset()
Exemplo n.º 2
0
    def test_normal(self):
        """
        Tests the Normal distribution component: deterministic and stochastic sampling,
        log-likelihood (against `norm.pdf`) and entropy (against the closed-form formula).
        """
        # Create 5 normal distributions (2 parameters (mean and stddev) each).
        param_space = Tuple(
            Float(shape=(5, )),  # mean
            Float(0.5, 1.0, shape=(5, )),  # stddev
            main_axes="B")

        # The Component to test.
        normal = Normal()

        # Batch of size=2 and deterministic (True).
        input_ = param_space.sample(2)
        expected = input_[0]  # 0 = mean
        # Sample n times, expect always mean value (deterministic draw).
        for _ in range(50):
            out = normal.sample(input_, deterministic=True)
            check(out, expected)
            # Capture the result so that the following check really verifies
            # `sample_deterministic` (and not the previous `sample` output again).
            out = normal.sample_deterministic(input_)
            check(out, expected)

        # Batch of size=1 and non-deterministic -> expect roughly the mean.
        input_ = param_space.sample(1)
        expected = input_[0][0]  # 0 = mean
        outs = []
        for _ in range(100):
            out = normal.sample(input_, deterministic=False)
            outs.append(out)
            out = normal.sample_stochastic(input_)
            outs.append(out)

        check(np.mean(outs), expected.mean(), decimals=1)

        means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0]])
        log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 10.0]])
        # The normal-adapter does this following line with the NN output (interpreted as log(stddev)):
        # Doesn't really matter here in this test case, though.
        stds = np.exp(
            np.clip(log_stds, a_min=MIN_LOG_NN_OUTPUT,
                    a_max=MAX_LOG_NN_OUTPUT))
        values = np.array([[1.0, 2.0, 0.4, 10.0, 5.4]])

        # Test log-likelihood outputs.
        out = normal.log_prob((means, stds), values)
        expected_outputs = np.log(norm.pdf(values, means, stds))
        check(out, expected_outputs)

        # Test entropy outputs.
        out = normal.entropy((means, stds))
        # See: https://en.wikipedia.org/wiki/Normal_distribution#Maximum_entropy
        expected_entropy = 0.5 * (1 + np.log(2 * np.square(stds) * np.pi))
        check(out, expected_entropy)
Exemplo n.º 3
0
    def test_categorical(self):
        """
        Tests the Categorical distribution component: deterministic sampling (argmax),
        stochastic sampling, log-likelihood and entropy (both against a softmax reference).
        """
        # 5 categorical distributions of 3 categories each.
        num_distributions, num_categories = 5, 3
        param_space = Float(shape=(num_distributions, num_categories), low=-1.0, high=2.0, main_axes="B")
        values_space = Int(num_categories, shape=(num_distributions, ), main_axes="B")

        # The Component to test.
        categorical = Categorical()

        # Deterministic draws on a batch of 3: must always yield the max-likelihood category.
        input_ = param_space.sample(3)
        expected = np.argmax(input_, axis=-1)
        for _ in range(10):
            check(categorical.sample(input_, deterministic=True), expected)
            check(categorical.sample_deterministic(input_), expected)

        # Stochastic draws on a fresh batch of 3: the overall mean should be roughly 1.0.
        input_ = param_space.sample(3)
        outs = []
        for _ in range(100):
            outs.append(categorical.sample(input_, deterministic=False))
            outs.append(categorical.sample_stochastic(input_))

        check(np.mean(outs), 1.0, decimals=0)

        input_ = param_space.sample(1)
        probs = softmax(input_)
        values = values_space.sample(1)

        # Log-likelihood: for each of the 5 distributions, pick the probability of the
        # sampled category and compare against its log.
        out = categorical.log_prob(input_, values)
        picked_probs = [probs[0][i][values[0][i]] for i in range(num_distributions)]
        check(out, np.log(np.array([picked_probs])), decimals=4)

        # Entropy: -sum(p * log(p)) over the category axis.
        out = categorical.entropy(input_)
        expected_entropy = -np.sum(probs * np.log(probs), axis=-1)
        check(out, expected_entropy)
Exemplo n.º 4
0
 def __init__(self, config, name=None):
     """
     Builds all sub-components from `config`: preprocessor, spaces, dueling Q-network and its
     non-trainable copy, prioritized replay memory, n-step component, loss, optimizer and
     epsilon decay.

     Args:
         config: Config object; the fields read here are visible in the body below.
         name: Optional name, handed through to the parent constructor.
     """
     super().__init__(config, name)

     self.Phi = Preprocessor.make(config.preprocessor)
     # Preprocessed states (x).
     batched_state_space = Space.make(config.state_space).with_batch()
     self.x = self.Phi(batched_state_space)
     # Actions (a).
     self.a = Space.make(config.action_space).with_batch()

     # Q-network with dueling outputs (A and V), each fed through its own pre-network.
     self.Q = Network.make(
         network=config.q_network,
         input_space=self.x,
         output_space=Dict(A=self.a, V=Float().with_batch()),
         adapters={
             "A": {"pre_network": config.dueling_a_network},
             "V": {"pre_network": config.dueling_v_network},
         }
     )
     # Non-trainable copy of Q.
     self.Qt = self.Q.copy(trainable=False)

     # Prioritized replay memory; the next-record setup maps "s" to "s_" using `n_step`.
     memory_record_space = Dict(
         {"s": self.x, "a": self.a, "r": float, "t": bool, "n": int}, main_axes="B"
     )
     self.memory = PrioritizedReplayBuffer.make(
         record_space=memory_record_space,
         capacity=config.memory_capacity,
         alpha=config.memory_alpha,
         beta=config.memory_beta,
         next_record_setup={"s": "s_", "n_step": config.n_step}
     )

     # N-step component.
     self.n_step = NStep(config.gamma, n_step=config.n_step, n_step_only=True)
     # Double/dueling/n-step Q-loss.
     self.L = DDDQNLoss()
     self.optimizer = Optimizer.make(self.config.optimizer)
     # For epsilon greedy learning.
     self.epsilon = Decay.make(self.config.epsilon)

     # Make sure the Preprocessor is clean.
     self.Phi.reset()
Exemplo n.º 5
0
    def test_bernoulli(self):
        """
        Tests the Bernoulli distribution component: deterministic sampling (sigmoid > 0.5),
        stochastic sampling, log-likelihood and (binary) entropy.
        """
        # 5 bernoulli distributions (or a multiple thereof if we use batch-size > 1).
        param_space = Float(-1.0, 1.0, shape=(5, ), main_axes="B")

        # The Component to test.
        bernoulli = Bernoulli()

        # Deterministic draws on a batch of 6: must always yield the max-likelihood outcome.
        input_ = param_space.sample(6)
        expected = sigmoid(input_) > 0.5
        for _ in range(10):
            check(bernoulli.sample(input_, deterministic=True), expected)
            check(bernoulli.sample_deterministic(input_), expected)

        # Stochastic draws on a fresh batch of 6: the overall mean should be roughly 0.5.
        input_ = param_space.sample(6)
        outs = []
        for _ in range(100):
            outs.append(bernoulli.sample(input_, deterministic=False))
            outs.append(bernoulli.sample_stochastic(input_))

        check(np.mean(outs), 0.5, decimals=1)

        logits = np.array([[0.1, -0.2, 0.3, -4.4, 2.0]])
        probs = sigmoid(logits)
        complement = 1.0 - probs

        # Log-likelihood: log(p) where the value is True, log(1 - p) where it is False.
        values = np.array([[True, False, False, True, True]])
        out = bernoulli.log_prob(logits, values=values)
        expected_log_probs = np.log(np.where(values, probs, 1.0 - probs))
        check(out, expected_log_probs)

        # Entropy: binary entropy with natural log.
        expected_entropy = -(probs * np.log(probs)) - (complement * np.log(complement))
        out = bernoulli.entropy(logits)
        check(out, expected_entropy)
Exemplo n.º 6
0
    def test_gumbel_softmax_distribution(self):
        """
        Tests the GumbelSoftmax distribution component: deterministic sampling (must equal the
        softmax of the inputs) and stochastic sampling (mean must roughly equal the softmax).

        The log-prob part of the test is currently disabled by an early `return` (see TODO below).
        """
        # 5-categorical Gumbel-Softmax.
        param_space = Float(shape=(5, ), main_axes="B")
        values_space = Float(shape=(5, ), main_axes="B")

        gumbel_softmax_distribution = GumbelSoftmax(temperature=1.0)

        # Batch of size=2 and deterministic (True).
        input_ = param_space.sample(2)
        expected = softmax(input_)
        # Sample n times, expect always the softmax'd inputs (deterministic draw).
        for _ in range(50):
            out = gumbel_softmax_distribution.sample(input_,
                                                     deterministic=True)
            check(out, expected)
            out = gumbel_softmax_distribution.sample_deterministic(input_)
            check(out, expected)

        # Batch of size=1 and non-deterministic -> expect roughly the vector of probs.
        input_ = param_space.sample(1)
        expected = softmax(input_)
        outs = []
        for _ in range(100):
            out = gumbel_softmax_distribution.sample(input_)
            outs.append(out)
            out = gumbel_softmax_distribution.sample_stochastic(input_)
            outs.append(out)

        check(np.mean(outs, axis=0), expected, decimals=1)

        return  # TODO: Figure out Gumbel Softmax log-prob calculation (our current implementation does not correspond with paper's formula).

        # NOTE: Everything below is intentionally unreachable until the TODO above is resolved.
        def gumbel_log_density(y, probs, num_categories, temperature=1.0):
            # https://arxiv.org/pdf/1611.01144.pdf.
            # Use the stdlib `math` module directly: the `np.math` alias is deprecated and no
            # longer available in recent NumPy versions.
            import math
            density = math.factorial(num_categories - 1) * math.pow(temperature, num_categories - 1) * \
                (np.sum(probs / np.power(y, temperature), axis=-1) ** -num_categories) * \
                np.prod(probs / np.power(y, temperature + 1.0), axis=-1)
            return np.log(density)

        # Test log-likelihood outputs.
        input_ = param_space.sample(3)
        values = values_space.sample(3)
        expected = gumbel_log_density(values,
                                      softmax(input_),
                                      num_categories=param_space.shape[0])

        out = gumbel_softmax_distribution.log_prob(input_, values)
        check(out, expected)
Exemplo n.º 7
0
    def test_multivariate_normal(self):
        """
        Tests the MultivariateNormal distribution component: deterministic sampling (mean),
        stochastic sampling and log-likelihood (against summed per-event `norm.pdf` logs).
        """
        # Create batch0=n (batch-rank), batch1=2 (can be used for m mixed Gaussians), num-events=3 (trivariate)
        # distributions (2 parameters (mean and stddev) each).
        num_events = 3  # 3=trivariate Gaussian
        num_mixed_gaussians = 2  # 2x trivariate Gaussians (mixed)
        event_shape = (num_mixed_gaussians, num_events)
        param_space = Tuple(
            Float(shape=event_shape),  # mean
            Float(0.5, 1.0, shape=event_shape),  # diag (variance)
            main_axes="B")
        values_space = Float(shape=event_shape, main_axes="B")

        # The Component to test.
        distribution = MultivariateNormal()

        # Deterministic draws on a batch of 4: must always yield the mean.
        input_ = param_space.sample(4)
        expected = input_[0]  # 0=mean
        for _ in range(50):
            check(distribution.sample(input_, deterministic=True), expected)
            check(distribution.sample_deterministic(input_), expected)

        # Stochastic draws on a batch of 1: the overall mean should be roughly the mean param.
        input_ = param_space.sample(1)
        expected = input_[0]  # 0=mean
        outs = []
        for _ in range(100):
            outs.append(distribution.sample(input_, deterministic=False))
            outs.append(distribution.sample_stochastic(input_))

        check(np.mean(outs), expected.mean(), decimals=1)

        # NOTE(review): `stds` is drawn from the unbounded `values_space` - confirm that this
        # space cannot produce non-positive values.
        means = values_space.sample(2)
        stds = values_space.sample(2)
        values = values_space.sample(2)

        # Test log-likelihood outputs (against scipy).
        out = distribution.log_prob((means, stds), values)
        # Sum up the individual log-probs as we have a diag (independent) covariance matrix.
        expected_log_probs = np.sum(np.log(norm.pdf(values, means, stds)), axis=-1)
        check(out, expected_log_probs, decimals=4)
Exemplo n.º 8
0
    def test_beta(self):
        """
        Tests the Beta distribution component (scaled to [low, high]): deterministic sampling
        (scaled mean), stochastic sampling and log-likelihood (against `beta.pdf`).
        The entropy method is only called, its output is not verified yet.
        """
        # Create 5 beta distributions (2 parameters (alpha and beta) each).
        param_space = Tuple(
            Float(shape=(5, )),  # alpha
            Float(shape=(5, )),  # beta
            main_axes="B")
        values_space = Float(shape=(5, ), main_axes="B")

        # The Component to test.
        low, high = -1.0, 2.0
        beta_distribution = Beta(low=low, high=high)

        def scaled_mean(params):
            # Mean for a Beta distribution: 1 / [1 + (beta/alpha)], then scaled into [low, high].
            return (1.0 / (1.0 + params[1] / params[0])) * (high - low) + low

        # Deterministic draws on a batch of 2: must always yield the (scaled) mean.
        input_ = param_space.sample(2)
        expected = scaled_mean(input_)
        for _ in range(100):
            check(beta_distribution.sample(input_, deterministic=True), expected)
            check(beta_distribution.sample_deterministic(input_), expected)

        # Stochastic draws on a batch of 1: the overall mean should be roughly the (scaled) mean.
        input_ = param_space.sample(1)
        expected = scaled_mean(input_)
        outs = []
        for _ in range(100):
            outs.append(beta_distribution.sample(input_, deterministic=False))
            outs.append(beta_distribution.sample_stochastic(input_))

        check(np.mean(outs), expected.mean(), decimals=1)

        alpha_ = values_space.sample(1)
        beta_ = values_space.sample(1)
        values = values_space.sample(1)
        values_scaled = values * (high - low) + low

        # Test log-likelihood outputs (against scipy): the component receives the scaled
        # values, the reference pdf the unscaled ones.
        out = beta_distribution.log_prob((alpha_, beta_), values_scaled)
        expected_log_probs = np.log(beta.pdf(values, alpha_, beta_))
        check(out, expected_log_probs, decimals=4)

        # TODO: Test entropy outputs (against scipy).
        beta_distribution.entropy((alpha_, beta_))
Exemplo n.º 9
0
class TestMemoriesGenerically(unittest.TestCase):
    """
    Tests different generic functionalities of Memories.
    """
    record_space = Dict(
        states=dict(state1=float, state2=Float(shape=(2,))),
        actions=dict(action1=int),
        reward=float,
        terminals=bool,
        main_axes="B"
    )
    record_space_no_next_state = Dict(s=dict(s1=float, s2=float), a=dict(a1=Int(10)), r=float, t=Bool(), main_axes="B")

    capacity = 10
    alpha = 1.0
    beta = 1.0
    max_priority = 1.0

    def _assert_mismatched_batch_rejected(self, memory, batch_size):
        """
        Asserts that adding records whose "s_" batch size differs from the rest raises SurrealError.

        Uses `assertRaises` rather than `try: ...; assert False; except ...` so that the check
        still fails properly when asserts are stripped (python -O).

        Args:
            memory: The memory object to insert into.
            batch_size: Batch size for "s_"; all other fields are sampled with `batch_size + 1`.
        """
        data = self.record_space_no_next_state.sample(batch_size + 1)
        data["s_"] = self.record_space_no_next_state["s"].sample(batch_size)
        with self.assertRaises(SurrealError):
            memory.add_records(data)

    def _check_next_state_offset(self, memory, num_records, offset, num_draws=20):
        """
        Repeatedly fetches records and checks that each next-state is `offset` larger than its state.

        Args:
            memory: The memory object to fetch records from.
            num_records: Number of records to request per draw (also the expected returned count).
            offset: Expected difference between "s_" and "s" values (for both "s1" and "s2").
            num_draws: How many times to fetch and check.
        """
        for _ in range(num_draws):
            retrieved_data = memory.get_records(num_records=num_records)
            self.assertEqual(num_records, len(retrieved_data["t"]))

            for i in range(num_records):
                check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - offset)
                check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - offset)

    def test_next_state_handling(self):
        """
        Tests if next-states can be stored efficiently (not using any space!) in the memory.

        NOTE: The memory does not care about terminal signals, it will always return the n-next-in-memory state
        regardless of whether this is a useful state (terminal=False) or not (terminal=True). In case of a
        terminal=True, the next state (whether it be the true terminal state, the reset state, or any other random
        state) does not matter anyway.
        """
        capacity = 10
        batch_size = 2

        # Test all classes of memories.
        for class_ in [ReplayBuffer, PrioritizedReplayBuffer]:
            memory = class_(record_space=self.record_space_no_next_state, capacity=capacity,
                            next_record_setup=dict(s="s_"))

            # Insert n records (inserts must always be batch-size).
            data = dict(
                s=dict(s1=np.array([0.0, 1.0]), s2=np.array([2.0, 3.0])),
                a=np.array([0, 1]), r=np.array([-0.0, -1.0]), t=np.array([False, True]),
                s_=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1]))
            )
            memory.add_records(data)

            # Check, whether inserting the wrong batch size raises Exception.
            self._assert_mismatched_batch_rejected(memory, batch_size)

            # Assert we can now fetch n elements.
            retrieved_data = memory.get_records(num_records=1)
            self.assertEqual(1, len(retrieved_data["t"]))

            # Check the next state.
            if retrieved_data["s"]["s1"][0] == 0.0:
                self.assertTrue(retrieved_data["s_"]["s1"] == 0.1 and retrieved_data["s_"]["s2"] == 2.1)
            else:
                self.assertTrue(retrieved_data["s"]["s1"] == 1.0)
                self.assertTrue(retrieved_data["s_"]["s1"] == 1.1 and retrieved_data["s_"]["s2"] == 3.1)

            # Insert another 2xn records and then check for correct next-state returns when getting records.
            data = dict(
                s=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1])),
                a=np.array([2, 3]), r=np.array([-2.0, -3.0]), t=np.array([False, False]),
                s_=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2])),
                a=np.array([4, 5]), r=np.array([-4.0, -5.0]), t=np.array([True, True]),
                s_=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3]))
            )
            memory.add_records(data)

            # Check the next states (always 0.1 larger than state).
            self._check_next_state_offset(memory, num_records=2, offset=0.1)
            self.assertEqual(memory.size, 6)

            # Insert up to capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3])),
                a=np.array([6, 7]), r=np.array([-6.0, -7.0]), t=np.array([True, False]),
                s_=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4])),
                a=np.array([8, 9]), r=np.array([-8.0, -9.0]), t=np.array([False, False]),
                s_=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5]))
            )
            memory.add_records(data)

            # Check the next states (always 0.1 larger than state).
            self._check_next_state_offset(memory, num_records=3, offset=0.1)
            self.assertEqual(memory.size, 10)

            # Go a little bit (one batch) over capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5])),
                a=np.array([10, 11]), r=np.array([-10.0, -11.0]), t=np.array([True, True]),
                s_=dict(s1=np.array([0.6, 1.6]), s2=np.array([2.6, 3.6]))
            )
            memory.add_records(data)

            # Check the next states (always 0.1 larger than state).
            self._check_next_state_offset(memory, num_records=4, offset=0.1)
            self.assertEqual(memory.size, 10)

    def test_next_state_handling_with_n_step(self):
        """
        Tests if next-states can be stored efficiently (not using any space!) in the memory using an n-step memory.

        NOTE: The memory does not care about terminal signals, it will always return the n-next-in-memory state
        regardless of whether this is a useful state (terminal=False) or not (terminal=True). In case of a
        terminal=True, the next state (whether it be the true terminal state, the reset state, or any other random
        state) does not matter anyway.
        """
        capacity = 10
        batch_size = 2
        # Test all classes of memories.
        for class_ in [ReplayBuffer, PrioritizedReplayBuffer]:
            memory = class_(record_space=self.record_space_no_next_state, capacity=capacity,
                            next_record_setup=dict(s="s_", n_step=3))

            # Insert n records (inserts must always be batch-size).
            data = dict(
                s=dict(s1=np.array([0.0, 1.0]), s2=np.array([2.0, 3.0])),
                a=np.array([0, 1]), r=np.array([-0.0, -1.0]), t=np.array([False, True]),
                s_=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3]))  # s' is now the n-step s'
            )
            memory.add_records(data)

            # Check, whether inserting the wrong batch size raises Exception.
            self._assert_mismatched_batch_rejected(memory, batch_size)

            # Assert we cannot pull samples yet. n-step is 3, so we need at least 3 elements in memory.
            with self.assertRaises(SurrealError):
                memory.get_records(num_records=1)

            # Insert another 2xn records and then check for correct next-state returns when getting records.
            data = dict(
                s=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1])),
                a=np.array([2, 3]), r=np.array([-2.0, -3.0]), t=np.array([False, False]),
                s_=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4]))  # s' is now the n-step s'
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2])),
                a=np.array([4, 5]), r=np.array([-4.0, -5.0]), t=np.array([True, True]),
                s_=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5]))  # s' is now the n-step s'
            )
            memory.add_records(data)

            # Check the n-step next states (always 0.3 larger than state).
            self._check_next_state_offset(memory, num_records=2, offset=0.3)
            self.assertEqual(memory.size, 6)

            # Insert up to capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3])),
                a=np.array([6, 7]), r=np.array([-6.0, -7.0]), t=np.array([True, False]),
                s_=dict(s1=np.array([0.6, 1.6]), s2=np.array([2.6, 3.6]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4])),
                a=np.array([8, 9]), r=np.array([-8.0, -9.0]), t=np.array([False, False]),
                s_=dict(s1=np.array([0.7, 1.7]), s2=np.array([2.7, 3.7]))
            )
            memory.add_records(data)

            # Check the n-step next states (always 0.3 larger than state).
            self._check_next_state_offset(memory, num_records=3, offset=0.3)
            self.assertEqual(memory.size, 10)

            # Go a little bit (two batches) over capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5])),
                a=np.array([10, 11]), r=np.array([-10.0, -11.0]), t=np.array([True, True]),
                s_=dict(s1=np.array([0.8, 1.8]), s2=np.array([2.8, 3.8]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.6, 1.6]), s2=np.array([2.6, 3.6])),
                a=np.array([10, 11]), r=np.array([-10.0, -11.0]), t=np.array([False, False]),
                s_=dict(s1=np.array([0.9, 1.9]), s2=np.array([2.9, 3.9]))
            )
            memory.add_records(data)

            # Check the n-step next states (always 0.3 larger than state).
            self._check_next_state_offset(memory, num_records=4, offset=0.3)
            self.assertEqual(memory.size, 10)
Exemplo n.º 10
0
class TestPrioritizedReplayBuffer(unittest.TestCase):
    """
    Tests insertion and (weighted) sampling of the PrioritizedReplayBuffer Component.

    Also exercises the sum/min segment trees that the buffer exposes via `merged_segment_tree`.
    """
    record_space = Dict(
        states=dict(state1=float, state2=Float(shape=(2,))),
        actions=dict(action1=int),
        reward=float,
        terminals=bool,
        main_axes="B"
    )

    capacity = 10
    alpha = 1.0
    beta = 1.0
    max_priority = 1.0

    def test_insert(self):
        """
        Tests `size` and `index` bookkeeping for single, batched and over-capacity inserts.
        """
        memory = PrioritizedReplayBuffer(
            record_space=self.record_space,
            capacity=self.capacity,
            alpha=self.alpha,
            beta=self.beta
        )

        # Assert indices 0 before insert.
        self.assertEqual(memory.size, 0)
        self.assertEqual(memory.index, 0)

        # Insert single record (no batch rank).
        data = self.record_space.sample()
        memory.add_records(data)
        self.assertEqual(memory.size, 1)
        self.assertEqual(memory.index, 1)

        # Insert single record (w/ batch rank).
        data = self.record_space.sample(1)
        memory.add_records(data)
        self.assertEqual(memory.size, 2)
        self.assertEqual(memory.index, 2)

        # Insert batched records.
        data = self.record_space.sample(5)
        memory.add_records(data)
        self.assertEqual(memory.size, 7)
        self.assertEqual(memory.index, 7)

        # Insert over capacity: size saturates at capacity, index wraps around ((7 + 100) % 10 == 7).
        data = self.record_space.sample(100)
        memory.add_records(data)
        self.assertEqual(memory.size, 10)
        self.assertEqual(memory.index, 7)

    def test_update_records(self):
        """
        Tests that updated priorities change the sampling frequencies of the respective records.
        """
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=self.capacity)

        # Insert record samples.
        num_records = 2
        data = self.record_space.sample(num_records)
        memory.add_records(data)
        self.assertEqual(memory.size, num_records)
        self.assertEqual(memory.index, num_records)

        # Fetch records, their indices and weights. Sampling must not change size/index.
        batch, indices, weights = memory.get_records_with_indices_and_weights(num_records)
        check(weights, np.ones(shape=(num_records,)))
        self.assertEqual(num_records, len(indices))
        self.assertEqual(memory.size, num_records)
        self.assertEqual(memory.index, num_records)

        # Update weight of index 0 to very small.
        memory.update_records(np.array([0]), np.array([0.01]))
        # Expect to sample almost only index 1 (which still has a weight of 1.0).
        # Only indices 0 and 1 exist, so sum(indices) counts how often index 1 was drawn.
        for _ in range(100):
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 970)

        # Give both indices the same (random) weight.
        # Expect to sample equally.
        for _ in range(100):
            rand = np.random.random()
            memory.update_records(np.array([0, 1]), np.array([rand, rand]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 400)
            self.assertLessEqual(np.sum(indices), 600)

        # Update weights to be 1:ratio (index 0 : index 1).
        # Expect to sample index 1 `ratio` times as often as index 0; bounds are for 1000 draws.
        for ratio, lower, upper in ((2, 600, 750), (4, 750, 850), (9, 850, 950)):
            for _ in range(100):
                rand = np.random.random() * 10
                memory.update_records(np.array([0, 1]), np.array([rand, rand * ratio]))
                _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
                self.assertGreaterEqual(np.sum(indices), lower)
                self.assertLessEqual(np.sum(indices), upper)

        # Insert more record samples (wraps around: (2 + 10) % 10 == 2).
        num_records = 10
        data = self.record_space.sample(num_records)
        memory.add_records(data)
        self.assertEqual(memory.size, self.capacity)
        self.assertEqual(memory.index, 2)

        # Update weights to strictly increase with the index (0.1 up to 512.0) and sample batches of size 1 to 5.
        memory.update_records(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                              np.array([0.1, 1., 3., 8., 16., 32., 64., 128., 256., 512.]))
        counts = Counter()
        for _ in range(1000):
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=np.random.randint(1, 6))
            for i in indices:
                counts[i] += 1
        # Higher-priority indices must have been drawn at least as often as lower-priority ones.
        counts_by_index = [counts[i] for i in range(10)]
        self.assertEqual(
            counts_by_index, sorted(counts_by_index),
            msg="Sample counts do not increase with priority: {}".format(counts)
        )

    def test_segment_tree_insert_values(self):
        """
        Tests if segment tree inserts into correct positions.
        """
        memory = PrioritizedReplayBuffer(
            record_space=self.record_space,
            capacity=self.capacity,
            alpha=self.alpha,
            beta=self.beta
        )

        # Smallest power of two that is >= capacity.
        priority_capacity = 1
        while priority_capacity < self.capacity:
            priority_capacity *= 2

        sum_segment_values = memory.merged_segment_tree.sum_segment_tree.values
        min_segment_values = memory.merged_segment_tree.min_segment_tree.values

        self.assertEqual(sum(sum_segment_values), 0)
        self.assertEqual(sum(min_segment_values), float("inf"))
        self.assertEqual(len(sum_segment_values), 2 * priority_capacity)
        self.assertEqual(len(min_segment_values), 2 * priority_capacity)

        # Insert 1 Element.
        observation = self.record_space.sample(size=1)
        memory.add_records(observation)

        # Check insert positions
        # Initial insert is at priority capacity
        start = priority_capacity

        # Walk from the leaf up to the root (node 1); each parent sits at half the child's position.
        while start >= 1:
            self.assertEqual(sum_segment_values[start], 1.0)
            self.assertEqual(min_segment_values[start], 1.0)
            start //= 2

        # Insert another Element.
        observation = self.record_space.sample(size=1)
        memory.add_records(observation)

        # Index shifted 1
        start = priority_capacity + 1
        self.assertEqual(sum_segment_values[start], 1.0)
        self.assertEqual(min_segment_values[start], 1.0)
        start //= 2
        while start >= 1:
            # 1 + 1 is 2 on the segment.
            self.assertEqual(sum_segment_values[start], 2.0)
            # min is still 1.
            self.assertEqual(min_segment_values[start], 1.0)
            start //= 2

    def test_tree_insert(self):
        """
        Tests inserting into the segment tree and querying segments.
        """
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        self.assertTrue(np.isclose(tree.get_sum(), 4.0))
        self.assertTrue(np.isclose(tree.get_sum(0, 2), 0.0))
        self.assertTrue(np.isclose(tree.get_sum(0, 3), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, 3), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, -1), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, 4), 4.0))

    def test_prefixsum_idx(self):
        """
        Tests fetching the index corresponding to a prefix sum.
        """
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)

        self.assertEqual(tree.index_of_prefixsum(0.0), 2)
        self.assertEqual(tree.index_of_prefixsum(0.5), 2)
        self.assertEqual(tree.index_of_prefixsum(0.99), 2)
        self.assertEqual(tree.index_of_prefixsum(1.01), 3)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(4.0), 3)

        # Fresh buffer with all four leaves populated.
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(0, 0.5)
        tree.insert(1, 1.0)
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        self.assertEqual(tree.index_of_prefixsum(0.0), 0)
        self.assertEqual(tree.index_of_prefixsum(0.55), 1)
        self.assertEqual(tree.index_of_prefixsum(0.99), 1)
        self.assertEqual(tree.index_of_prefixsum(1.51), 2)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(5.50), 3)
Exemplo n.º 11
0
    def test_joint_cumulative_distribution(self):
        """
        Tests sampling (deterministic and stochastic) and log-probs of a nested JointCumulativeDistribution.
        """
        low, high = -1.0, 1.0

        # Parameters for each sub-distribution.
        param_space = Dict(
            {
                "a": Float(shape=(4, )),  # 4-discrete
                "b": Dict({
                    "ba": Tuple([Float(shape=(3, )), Float(0.1, 1.0, shape=(3, ))]),  # 3-variate normal
                    "bb": Tuple([Float(shape=(2, )), Float(shape=(2, ))]),  # beta -1 to 1
                    "bc": Tuple([Float(shape=(4, )), Float(0.1, 1.0, shape=(4, ))]),  # normal (dim=4)
                }),
            },
            main_axes="B")

        # Values, for which to compute log-probs further below.
        values_space = Dict(
            {
                "a": Int(4),
                "b": Dict({
                    "ba": Float(shape=(3, )),
                    "bb": Float(shape=(2, )),
                    "bc": Float(shape=(4, )),
                }),
            },
            main_axes="B")

        cumulative_distribution = JointCumulativeDistribution(
            distributions={
                "a": Categorical(),
                "b": {
                    "ba": MultivariateNormal(),
                    "bb": Beta(low=low, high=high),
                    "bc": Normal(),
                },
            })

        def expected_means(params, a_value):
            # Builds the expected (nested) mean structure for the given parameters.
            bb_alpha, bb_beta = params["b"]["bb"][0], params["b"]["bb"][1]
            return {
                "a": a_value,
                "b": {
                    "ba": params["b"]["ba"][0],  # [0]=Mean
                    # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] * range + low
                    "bb": (1.0 / (1.0 + bb_beta / bb_alpha)) * (high - low) + low,
                    "bc": params["b"]["bc"][0],
                },
            }

        # Deterministic draws with a batch of 2: Every draw must return the expected value exactly
        # (categorical part -> argmax).
        input_ = param_space.sample(2)
        input_["a"] = softmax(input_["a"])
        expected_mean = expected_means(input_, np.argmax(input_["a"], axis=-1))
        for _ in range(20):
            check(cumulative_distribution.sample(input_, deterministic=True), expected_mean)
            check(cumulative_distribution.sample_deterministic(input_), expected_mean)

        # Stochastic draws with a batch of 1: The empirical mean should be close to the expected one
        # (categorical part -> probability-weighted category index).
        input_ = param_space.sample(1)
        input_["a"] = softmax(input_["a"])
        expected_mean = expected_means(input_, np.sum(input_["a"] * np.array([0, 1, 2, 3])))

        outs = []
        for _ in range(500):
            outs.append(cumulative_distribution.sample(input_))
            outs.append(cumulative_distribution.sample_stochastic(input_))

        def empirical_mean(pick):
            # Averages the picked component (of batch item 0) over all collected draws.
            return np.mean(np.stack([pick(o) for o in outs], axis=0), axis=0)

        check(empirical_mean(lambda o: o["a"][0]), expected_mean["a"], atol=0.3)
        for key in ("ba", "bb", "bc"):
            check(empirical_mean(lambda o: o["b"][key][0]), expected_mean["b"][key][0], decimals=1)

        # Log-likelihood outputs.
        params = param_space.sample(1)
        params["a"] = softmax(params["a"])
        values = values_space.sample(1)
        # The beta log-prob is computed on the still unscaled values (the numpy/scipy calculation has
        # no scaling) ...
        log_p_bb = np.sum(np.log(beta.pdf(values["b"]["bb"], params["b"]["bb"][0], params["b"]["bb"][1])))
        # ... only then, scale the b/bb values into the [low, high] range that the distribution uses.
        values["b"]["bb"] = values["b"]["bb"] * (high - low) + low

        log_p_a = np.log(params["a"][0][values["a"][0]])
        log_p_ba = np.sum(np.log(norm.pdf(values["b"]["ba"][0], params["b"]["ba"][0], params["b"]["ba"][1])))
        log_p_bc = np.sum(np.log(norm.pdf(values["b"]["bc"][0], params["b"]["bc"][0], params["b"]["bc"][1])))
        expected_log_llh = log_p_a + log_p_ba + log_p_bb + log_p_bc

        check(cumulative_distribution.log_prob(params, values), expected_log_llh, decimals=0)
Exemplo n.º 12
0
    def test_squashed_normal(self):
        """
        Tests sampling (deterministic and stochastic) and log-probs of the SquashedNormal distribution.
        """
        low, high = -2.0, 1.0
        value_range = high - low

        def squash(raw):
            # tanh-squash into (-1, 1), then rescale to (low, high).
            return ((np.tanh(raw) + 1.0) / 2.0) * value_range + low

        param_space = Tuple(Float(-1.0, 1.0, shape=(5, )),
                            Float(0.5, 1.0, shape=(5, )),
                            main_axes="B")
        squashed_distribution = SquashedNormal(low=low, high=high)

        # Deterministic draws with a batch of 2: Always expect the squashed mean ([0] = mean).
        input_ = param_space.sample(2)
        expected = squash(input_[0])
        for _ in range(50):
            check(squashed_distribution.sample(input_, deterministic=True), expected)
            check(squashed_distribution.sample_deterministic(input_), expected)

        # Stochastic draws with a batch of 1: Expect roughly the squashed mean; all draws within bounds.
        input_ = param_space.sample(1)
        expected = squash(input_[0])
        samplers = (
            lambda: squashed_distribution.sample(input_, deterministic=False),
            lambda: squashed_distribution.sample_stochastic(input_),
        )
        outs = []
        for _ in range(500):
            for draw in samplers:
                out = draw()
                outs.append(out)
                self.assertTrue(np.max(out) <= high)
                self.assertTrue(np.min(out) >= low)

        check(np.mean(outs), expected.mean(), decimals=1)

        # Hand-picked parameters for the log-prob check.
        means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0],
                          [-0.1, -0.2, -0.3, -0.4, -1.0]])
        log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 10.0],
                             [0.7, -0.3, 0.4, -0.9, 8.0]])
        # The normal-adapter clips the NN output (interpreted as log(stddev)) like this before
        # exponentiating. Doesn't really matter here in this test case, though.
        clipped_log_stds = np.clip(log_stds, a_min=MIN_LOG_NN_OUTPUT, a_max=MAX_LOG_NN_OUTPUT)
        stds = np.exp(clipped_log_stds)
        # All values lie within low and high.
        values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05],
                           [-0.9, -0.2, 0.4, -0.1, -1.05]])

        # Test log-likelihood outputs.
        # TODO: understand and comment the following formula to get the log-prob.
        # Unsquash values, then get log-llh from regular gaussian.
        unsquashed_values = np.arctanh((values - low) / value_range * 2.0 - 1.0)
        gaussian_log_prob = np.log(norm.pdf(unsquashed_values, means, stds))
        squash_correction = np.sum(np.log(1 - np.tanh(unsquashed_values)**2), axis=-1, keepdims=True)
        log_prob = gaussian_log_prob - squash_correction

        check(squashed_distribution.log_prob((means, stds), values), log_prob)

        # Test entropy outputs.
        # TODO
Exemplo n.º 13
0
    def test_mixture(self):
        """
        Tests sampling from a MixtureDistribution of 3 bivariate normals (weighted by a categorical).

        The prob/log-prob checks at the end are currently disabled via an early `return`.
        """
        num_distributions = 3
        num_events_per_multivariate = 2  # 2=bivariate

        def bivariate_params():
            # (mean, diag) parameter tuple for one bivariate normal.
            return Tuple(
                Float(shape=(num_events_per_multivariate, )),
                Float(shape=(num_events_per_multivariate, ), low=0.5, high=1.0),
            )

        param_space = Dict(
            {
                "categorical": Float(shape=(num_distributions, ), low=-1.5, high=2.3),
                "parameters0": bivariate_params(),
                "parameters1": bivariate_params(),
                "parameters2": bivariate_params(),
            },
            main_axes="B")
        values_space = Float(shape=(num_events_per_multivariate, ), main_axes="B")

        # The Component to test (sub-distributions given via different spec types).
        mixture = MixtureDistribution(
            MultivariateNormal(),
            "multi-variate-normal",
            "multivariate_normal")

        def mixture_mean(params):
            # Probability-weighted sum of the sub-distributions' means -> 2D vector (bivariate).
            probs = softmax(params["categorical"])
            weighted = [
                probs[:, k:k + 1] * params[key][0]
                for k, key in enumerate(("parameters0", "parameters1", "parameters2"))
            ]
            return weighted[0] + weighted[1] + weighted[2]

        # Deterministic draws.
        # Note: Usually, a deterministic draw should return the max-likelihood value (the mean of the
        # argmax(categorical) sub-distribution). The mixture's overall mean is what is checked here.
        input_ = param_space.sample(1)
        expected = mixture_mean(input_)
        for _ in range(20):
            check(mixture.sample(input_, deterministic=True), expected)
            check(mixture.sample_deterministic(input_), expected)

        # Stochastic draws with a batch of 1 -> expect roughly the mean.
        input_ = param_space.sample(1)
        expected = mixture_mean(input_)
        outs = []
        for _ in range(500):
            outs.append(mixture.sample(input_, deterministic=False))
            outs.append(mixture.sample_stochastic(input_))
        check(np.mean(np.array(outs), axis=0), expected, decimals=1)

        return
        # TODO: prob/log-prob tests for Mixture (everything below is not executed yet).

        # Test prob and log-likelihood outputs (against scipy).
        for i in range(20):
            params = param_space.sample(1)
            # Make sure categorical params are softmaxed.
            category_probs = softmax(params["categorical"][0])
            values = values_space.sample(1)

            # Density of the value under each of the sub-distributions.
            densities = [
                multivariate_normal.pdf(
                    values[0],
                    mean=params["parameters{}".format(j)][0][0],
                    cov=params["parameters{}".format(j)][1][0])
                for j in range(3)
            ]

            expected = 0.0
            for j in range(3):
                expected += category_probs[j] * densities[j]
            out = mixture.prob(params, values)
            check(out[0], expected, atol=0.1)

            expected = np.zeros(shape=(3, ))
            for j in range(3):
                expected[j] = np.log(category_probs[j]) + np.log(densities[j])
            expected = np.log(np.sum(np.exp(expected)))
            out = mixture.log_prob(params, values)
            print("{}: out={} expected={}".format(i, out, expected))
            check(out, np.array([expected]), atol=0.25)
Exemplo n.º 14
0
class DADS(RLAlgo):
    """
    The DADS algorithm.
    [1] Dynamics-Aware Unsupervised Discovery of Skills - A. Sharma∗, S. Gu, S. Levine, V. Kumar, K. Hausman - Google Brain 2019
        Compare to "Algorithm 1" and "Algorithm 2" pseudocodes in paper.
    """
    def __init__(self, config, name=None):
        """
        Args:
            config: The config object for this algo. Provides the spaces, the network- and optimizer specs,
                the SAC sub-config and the horizon/buffer settings read below.
            name (Optional[str]): Passed on to the parent c'tor.
        """
        super().__init__(config, name)
        self.inference = False  # True=planning mode. False="supervised+intrinsic-reward+model-learning" mode.
        self.he = 0  # Current step within He (total episode horizon).
        # Steps already executed with the currently selected skill (repeat horizon Hz), keyed by actor slot.
        # Slots that have no entry yet count as 0.
        self.hz = {}

        self.preprocessor = Preprocessor.make(config.preprocessor)
        self.s = self.preprocessor(
            config.state_space.with_batch())  # preprocessed states
        self.a = config.action_space.with_batch()  # actions (a)
        self.ri = Float(main_axes=[("Episode Horizon", config.episode_horizon)
                                   ])  # intrinsic rewards in He
        self.z = Float(-1.0, 1.0, shape=(config.dim_skill_vectors,), main_axes="B") if \
            config.discrete_skills is False else Int(config.dim_skill_vectors, main_axes="B")
        self.s_and_z = Dict(dict(s=self.s, z=self.z), main_axes="B")
        # Policy pi(a|s,z).
        self.pi = Network.make(input_space=self.s_and_z,
                               output_space=self.a,
                               **config.policy_network)
        # Dynamics model q(s'|s,z) with a mixture distribution as output.
        self.q = Network.make(input_space=self.s_and_z,
                              output_space=self.s,
                              distributions=dict(
                                  type="mixture",
                                  num_experts=config.num_q_experts),
                              **config.q_network)
        self.B = FIFOBuffer(Dict(dict(s=self.s, z=self.z, a=self.a, t=bool)),
                            config.episode_buffer_capacity,
                            when_full=self.event_buffer_full,
                            next_record_setup=dict(s="s_"))
        self.SAC = SAC(config=config.sac_config,
                       name="SAC-level0")  # Low-level SAC.
        self.q_optimizer = Optimizer.make(
            config.supervised_optimizer)  # supervised model optimizer
        self.Lsup = NegLogLikelihoodLoss(distribution=MixtureDistribution(
            num_experts=config.num_q_experts))
        self.preprocessor.reset()

    def update(self, samples, time_percentage):
        """
        Updates the dynamics model q from `samples`, then the SAC policy using intrinsic rewards.

        Args:
            samples: The batch to learn from. The keys read here are "s", "z", "a", "s_" and "t".
            time_percentage: Passed on to the q-optimizer and to `SAC.update`.
        """
        config = self.config
        num_denominator_samples = config.num_denominator_samples_for_ri  # `L` in the paper.

        parameters = self.q(dict(s=samples["s"], z=samples["z"]),
                            parameters_only=True)

        # The quantity that q models: s' itself or the difference (s' - s).
        if config.q_predicts_states_diff is False:
            targets = samples["s_"]
        else:
            targets = tf.nest.map_structure(lambda s, s_: s_ - s, samples["s"], samples["s_"])

        # Update for K1 (num_steps_per_supervised_update) iterations on same batch.
        # NOTE(review): `parameters` is computed only once, before the loop. Whether later iterations
        # see the already updated weights depends on Lsup/apply_gradients - TODO confirm.
        weights = self.q.get_weights(as_ref=True)
        for _ in range(config.num_steps_per_supervised_update):
            loss = self.Lsup(parameters, targets)
            self.q_optimizer.apply_gradients(loss,
                                             weights,
                                             time_percentage=time_percentage)

        # Calculate intrinsic rewards.
        # Pull a batch of zs of size batch * (L - 1) (b/c 1 batch is the `z` of the sample (numerator's z)).
        batch_size = len(samples["s"])
        zs = tf.concat(
            [samples["z"], self.z.sample(batch_size * (num_denominator_samples - 1))],
            axis=0)  # `axis` is a required argument of tf.concat: stack along the batch axis.

        def tile_batch(x):
            # Repeat `x` L times along the batch axis, leaving all other axes as they are.
            return tf.tile(x, [num_denominator_samples] + ([1] * (len(x.shape) - 1)))

        s = tf.nest.map_structure(tile_batch, samples["s"])
        # Evaluate the likelihoods on the same quantity that q was trained on above (s' or s' - s).
        s_ = tf.nest.map_structure(tile_batch, targets)
        # Single (efficient) forward pass yielding s' likelihoods.
        # After split+stack, slice 0 holds the likelihoods under the samples' own z (numerator).
        all_s__llhs = tf.stack(
            tf.split(self.q(dict(s=s, z=zs), s_, likelihood=True),
                     num_denominator_samples))
        # r = log(q(s'|s,z) / sum_i q(s'|s,z_i)) + log(L), written as one log: tf.math.log does not
        # accept the plain Python int L.
        r = tf.math.log(num_denominator_samples * all_s__llhs[0] / tf.reduce_sum(all_s__llhs, axis=0))
        # Update RL-algo's policy (same as π) from our batch (using intrinsic rewards).
        self.SAC.update(
            dict(s=samples["s"],
                 z=samples["z"],
                 a=samples["a"],
                 r=r,
                 s_=samples["s_"],
                 t=samples["t"]), time_percentage)

    # When buffer full -> Update transition model q.
    def event_buffer_full(self, event):
        """
        Flushes the buffer and runs an update on its contents.

        Args:
            event: The event object. `actor_time_steps` and (if the config has no `max_time_steps`)
                `env.max_time_steps` are read to compute the time percentage.
        """
        self.update(self.B.flush(),
                    time_percentage=event.actor_time_steps /
                    (self.config.max_time_steps or event.env.max_time_steps))

    def event_episode_starts(self, event):
        """
        Samples a new skill for the actor slot that starts an episode and resets its preprocessor state.

        Args:
            event: The event object; `actor_slots` and `current_actor_slot` are read.
        """
        slot = event.current_actor_slot
        # Initialize z if this hasn't happened yet.
        if self.z.value is None:
            self.z.assign(self.z.zeros(len(event.actor_slots)))
        # Sample new z at the trajectory's batch position.
        if self.inference is False:
            # Sample a new skill from Space z and store it in z (assume uniform).
            self.z.value[slot] = self.z.sample()
            self.hz[slot] = 0  # New skill -> restart its skill-horizon counter.
        # Reset preprocessor at actor's batch position.
        self.preprocessor.reset(batch_position=slot)

    # Fill the buffer with M samples.
    def event_tick(self, event):
        """
        Stores the last transition, re-samples expired skills and sends the next action to the env.

        Args:
            event: The event object; `s_`, `t`, `actor_slots`, `actor_time_steps` and `env` are read.
        """
        # Preprocess state.
        s_ = self.preprocessor(event.s_)

        ## If we are in inference mode -> do a planning step (rather than just act).
        #if self.inference:
        #    self.he += 1
        #    if self.he >= self.config.He:  # We have reached the end of the total episode horizon -> reset.
        #        env.reset()  # Send reset request to env.
        #        return
        #    self.plan(env.s)
        #    # Execute selected skill for Hz steps.
        #    if self.hz == self.config.Hz - 1:
        #        zi = self.N.sample()   # ?? ~ N[he/Hz]
        #        hz = 0  # reset counter
        #    hz += 1
        #else:

        # Add single(!) szas't-tuple to buffer.
        # This happens before any skill gets re-sampled below, so that the stored z is the one
        # that the stored action was computed with.
        # The preprocessed s' is stored: the buffer's and the networks' spaces are built from `self.s`.
        if event.actor_time_steps > 0:
            self.B.add_records(
                dict(s=self.s.value,
                     z=self.z.value,
                     a=self.a.value,
                     t=event.t,
                     s_=s_))

        # Re-sample the skill of each slot whose current skill has been used for `skill_horizon` steps.
        for i in event.actor_slots:
            steps = self.hz.get(i, 0)
            if steps >= self.config.skill_horizon:
                self.z.value[i] = self.z.sample()
                steps = 0
            # The action queried below counts as one more step with this slot's (current) skill.
            self.hz[i] = steps + 1

        # Query policy for an action (the policy's input space is the preprocessed state space).
        a_ = self.pi(dict(s=s_, z=self.z.value))

        # Send the new action back to the env.
        event.env.act(a_)

        # Store action and state for next tick.
        self.s.assign(s_)
        self.a.assign(a_)
Exemplo n.º 15
0
    def __init__(
            self,
            *,
            policy_network,
            q_network,
            state_space,
            action_space,
            sac_config,
            num_q_experts=4,  # 4 used in paper.
            q_predicts_states_diff=False,
            num_denominator_samples_for_ri=250,  # 50-500 used in paper
            dim_skill_vectors=10,
            discrete_skills=False,
            episode_horizon=200,
            skill_horizon=None,
            preprocessor=None,
            supervised_optimizer=None,
            num_steps_per_supervised_update=1,
            episode_buffer_capacity=200,
            summaries=None):
        """
        Normalizes the given settings and stores all c'tor arguments in this config (keyword-only).

        Args:
            policy_network (Network): The policy-network (pi) to use as a function approximator for the learnt policy.

            q_network (Network): The dynamics-network (q) to use as a function approximator for the learnt env
                dynamics. NOTE: Not to be confused with a Q-learning Q-net! In the paper, the dynamics function is
                called `q`, hence the same nomenclature here.

            state_space (Space): The state/observation Space.
            action_space (Space): The action Space.
            sac_config (SACConfig): The config for the internal SAC-Algo used to learn the skills using intrinsic rewards.

            num_q_experts (int): The number of experts used in the Mixture distribution output by the q-network to
                predict the next state (s') given s (state) and z (skill vector).

            q_predicts_states_diff (bool): Whether the q-network predicts the difference between s and s' rather than
                s' directly. Default: False.

            num_denominator_samples_for_ri (int): The number of samples to calculate for the denominator of the
                intrinsic reward function (`L` in the paper).

            dim_skill_vectors (int): The number of dimensions of the learnt skill vectors.
            discrete_skills (bool): Whether skill vectors are discrete (one-hot).
            episode_horizon (int): The episode horizon (He) to move within, when gathering episode samples.

            skill_horizon (Optional[int]): The horizon for which to use one skill vector (before sampling a new one).
                Default: Use value of `episode_horizon`.

            preprocessor (Preprocessor): The preprocessor (if any) to use.
            supervised_optimizer (Optimizer): The optimizer to use for the supervised (q) model learning task.

            num_steps_per_supervised_update (int): The number of gradient descent iterations per update
                (each iteration uses the same environment samples).

            episode_buffer_capacity (int): The capacity of the episode (experience) FIFOBuffer.

            summaries (List[any]): A list of summaries to produce if `UseTfSummaries` in debug.json is true.
                In the simplest case, this is a list of `self.[...]`-property names of the SAC object that should
                be tracked after each tick.
        """
        # Clean up network configs to be passable as **kwargs to `make`.
        # Networks are given as sequential config or directly as Keras objects -> prepend "network" key to spec.
        if isinstance(
                policy_network,
            (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
            policy_network = dict(network=policy_network)
        if isinstance(
                q_network,
            (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
            q_network = dict(network=q_network)

        # Make state/action space.
        state_space = Space.make(state_space)
        action_space = Space.make(action_space)

        # Fix SAC config, add correct state- and action-spaces.
        # The SAC sees the combination of env state (s) and skill vector (z) as its "state".
        # NOTE(review): z is declared as a Float space here, regardless of `discrete_skills` - TODO confirm
        # that this is intended for the discrete case.
        sac_config = SACConfig.make(
            sac_config,
            state_space=Dict(s=state_space,
                             z=Float(-1.0, 1.0, shape=(dim_skill_vectors, ))),
            action_space=action_space,
            # Use no memory. Updates are done from DADS' own buffer.
            memory_capacity=1,
            memory_batch_size=1,
            # Share policy network between DADS and underlying learning SAC.
            policy_network=policy_network)

        # Default: Keep one skill for the entire episode horizon.
        if skill_horizon is None:
            skill_horizon = episode_horizon

        # NOTE: `locals()` holds every local name defined up to this point. Any additional local variable
        # introduced above this call would hence be passed to the parent c'tor as well.
        super().__init__(
            locals())  # Config will store all c'tor variables automatically.

        # Keep track of which time-step stuff happened. Only important for by-time-step frequencies.
        self.last_update = 0