Example #1
File: dqn-her.py Project: ymd-h/cpprb
class BitFlippingEnv(gym.Env):
    """
    bit-flipping environment: https://arxiv.org/abs/1707.01495

    * Environment has n-bit state.
    * Initial state and goal state are randomly selected.
    * Action is one of 0, ..., n-1; action i flips the i-th bit
    * Reward is 0 if state == goal, otherwise reward is -1. (Sparse Binary Reward)

    Simple RL algorithms tend to fail for large ``n`` (e.g. ``n > 40``)
    """
    def __init__(self, n):
        self.n = n  # number of bits (also the number of actions)
        seeds = np.random.SeedSequence().spawn(3)
        self.np_random = np.random.default_rng(seeds[0])
        self.observation_space = Box(low=0, high=1, shape=(n,), dtype=int)
        self.action_space = Discrete(self.n)
        self.observation_space.seed(seeds[1].entropy)
        self.action_space.seed(seeds[2].entropy)

    def step(self, action):
        action = int(action)
        self.bit[action] = 1 - self.bit[action]
        done = (self.bit == self.goal).all()
        rew = 0 if done else -1
        return self.bit.copy(), rew, done, {}

    def reset(self):
        self.bit = self.np_random.integers(low=0, high=1, size=self.action_space.n,
                                           endpoint=True, dtype=int)
        self.goal = self.np_random.integers(low=0, high=1, size=self.action_space.n,
                                            endpoint=True, dtype=int)
        return self.bit.copy()
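A minimal smoke test for the environment above might look like the following sketch (illustrative only; it assumes the class and its `numpy`/`gym.spaces` imports are in scope and simply flips random bits until the goal is hit or a step budget runs out):

# Illustrative usage sketch for BitFlippingEnv; not part of the original file.
env = BitFlippingEnv(n=8)
obs = env.reset()
done = False
for t in range(100):
    action = env.action_space.sample()       # flip a uniformly chosen bit
    obs, rew, done, info = env.step(action)
    if done:                                 # current state matches the goal
        break
print("solved" if done else "unsolved", "after", t + 1, "steps")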
Example #2
    def test_gumbel_softmax(self):
        """Tests the GumbelSoftmax ActionDistribution (tf + eager only)."""
        for fw, sess in framework_iterator(frameworks=("tf2", "tf", "tfe"),
                                           session=True):
            batch_size = 1000
            num_categories = 5
            input_space = Box(-1.0, 1.0, shape=(batch_size, num_categories))
            input_space.seed(42)

            # Batch of size=n and deterministic.
            inputs = input_space.sample()
            gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0)

            expected = softmax(inputs)
            # Sample n times, expect always mean value (deterministic draw).
            out = gumbel_softmax.deterministic_sample()
            check(out, expected)

            # Batch of size=n and non-deterministic -> expect roughly that
            # the max-likelihood (argmax) ints are output (most of the time).
            inputs = input_space.sample()
            gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0)
            expected_mean = np.mean(np.argmax(inputs, -1)).astype(np.float32)
            outs = gumbel_softmax.sample()
            if sess:
                outs = sess.run(outs)
            check(np.mean(np.argmax(outs, -1)), expected_mean, rtol=0.08)
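The test above leans on two properties of the Gumbel-Softmax distribution: `deterministic_sample()` returns `softmax(inputs)`, and stochastic samples put most of their mass on the argmax category. A standalone NumPy sketch of the standard reparameterized sampling formula (not RLlib code, just the textbook recipe) shows where those expectations come from:

# Plain-NumPy sketch of Gumbel-Softmax sampling (standard formula, illustrative only).
import numpy as np

rng = np.random.default_rng(0)

def np_softmax(x, axis=-1):
    z = x - x.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def gumbel_softmax_sample(logits, temperature=1.0):
    u = rng.uniform(1e-8, 1.0, size=logits.shape)
    g = -np.log(-np.log(u))                        # Gumbel(0, 1) noise
    return np_softmax((logits + g) / temperature)  # relaxed one-hot sample

logits = np.array([[0.5, -1.0, 2.0, 0.0, 0.3]])
samples = np.stack([gumbel_softmax_sample(logits) for _ in range(1000)])
# Most relaxed samples peak at argmax(logits) (index 2 here).
print((samples.argmax(-1) == logits.argmax(-1)).mean())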
Example #3
def test_backward() -> None:
    """
    Test backward(). We just want to make sure that the gradient of the i-th task loss
    is zero for all parameters in output head j != i, and is nonzero for all parameters
    in output head i.
    """

    # Set up case.
    dim = SETTINGS["obs_dim"] + SETTINGS["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(SETTINGS["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = MultiTaskTrunkNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=SETTINGS["num_tasks"],
        num_shared_layers=SETTINGS["num_shared_layers"],
        num_task_layers=SETTINGS["num_task_layers"],
        hidden_size=hidden_size,
        downscale_last_layer=True,
        device=SETTINGS["device"],
    )

    # Construct batch of observations concatenated with one-hot task vectors.
    obs, task_indices = get_obs_batch(
        batch_size=SETTINGS["num_processes"],
        obs_space=observation_subspace,
        num_tasks=SETTINGS["num_tasks"],
    )

    # Make sure every task gets at least one process.
    assert set(task_indices.tolist()) == set(range(SETTINGS["num_tasks"]))

    # Get output of network.
    output = network(obs, task_indices)

    # Compute losses (we just compute the squared network output to keep it simple) and
    # test gradients.
    for i in range(SETTINGS["num_tasks"]):

        # Zero out gradients.
        network.zero_grad()

        # Compute loss over outputs from the current task.
        loss = torch.zeros(1)
        for process in range(obs.shape[0]):
            j = task_indices[process].item()
            if i == j:
                loss += torch.sum(output[process]**2)

        # Test gradients.
        loss.backward(retain_graph=True)
        check_gradients(network.trunk, nonzero=True)
        for j in range(SETTINGS["num_tasks"]):
            nonzero = j == i
            check_gradients(network.output_heads[j], nonzero=nonzero)
Example #4
    def test_categorical(self):
        batch_size = 10000
        num_categories = 4
        # Create categorical distribution with n categories.
        inputs_space = Box(
            -1.0, 2.0, shape=(batch_size, num_categories), dtype=np.float32)
        inputs_space.seed(42)
        values_space = Box(
            0, num_categories - 1, shape=(batch_size, ), dtype=np.int32)
        values_space.seed(42)

        inputs = inputs_space.sample()

        for fw, sess in framework_iterator(session=True):
            # Create the correct distribution object.
            cls = JAXCategorical if fw == "jax" else Categorical if \
                fw != "torch" else TorchCategorical
            categorical = cls(inputs, {})

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(
                cls,
                inputs_space.shape,
                fw=fw,
                sess=sess,
                bounds=(0, num_categories - 1))

            # Batch of size=batch_size and deterministic (True).
            expected = np.transpose(np.argmax(inputs, axis=-1))
            # Sample, expect always max value
            # (max likelihood for deterministic draw).
            out = categorical.deterministic_sample()
            check(out, expected)

            # Batch of size=batch_size and non-deterministic -> expect roughly the mean.
            out = categorical.sample()
            check(
                np.mean(out) if fw == "jax" else tf.reduce_mean(out)
                if fw != "torch" else torch.mean(out.float()),
                1.0,
                decimals=0)

            # Test log-likelihood outputs.
            probs = softmax(inputs)
            values = values_space.sample()

            out = categorical.logp(values
                                   if fw != "torch" else torch.Tensor(values))
            expected = []
            for i in range(batch_size):
                expected.append(np.sum(np.log(np.array(probs[i][values[i]]))))
            check(out, expected, decimals=4)

            # Test entropy outputs.
            out = categorical.entropy()
            expected_entropy = -np.sum(probs * np.log(probs), -1)
            check(out, expected_entropy)
Example #5
    def test_beta(self):
        input_space = Box(-2.0, 1.0, shape=(2000, 10))
        input_space.seed(42)
        low, high = -1.0, 2.0
        plain_beta_value_space = Box(0.0, 1.0, shape=(2000, 5))
        plain_beta_value_space.seed(42)

        for fw, sess in framework_iterator(session=True):
            cls = TorchBeta if fw == "torch" else Beta
            inputs = input_space.sample()
            beta_distribution = cls(inputs, {}, low=low, high=high)

            inputs = beta_distribution.inputs
            if sess:
                inputs = sess.run(inputs)
            else:
                inputs = inputs.numpy()
            alpha, beta_ = np.split(inputs, 2, axis=-1)

            # Mean for a Beta distribution: 1 / [1 + (beta/alpha)]
            expected = (1.0 / (1.0 + beta_ / alpha)) * (high - low) + low
            # Sample n times, expect always mean value (deterministic draw).
            out = beta_distribution.deterministic_sample()
            check(out, expected, rtol=0.01)

            # Batch of size=n and non-deterministic -> expect roughly the mean.
            values = beta_distribution.sample()
            if sess:
                values = sess.run(values)
            else:
                values = values.numpy()
            self.assertTrue(np.max(values) <= high)
            self.assertTrue(np.min(values) >= low)

            check(np.mean(values), expected.mean(), decimals=1)

            # Test log-likelihood outputs (against scipy).
            inputs = input_space.sample()
            beta_distribution = cls(inputs, {}, low=low, high=high)
            inputs = beta_distribution.inputs
            if sess:
                inputs = sess.run(inputs)
            else:
                inputs = inputs.numpy()
            alpha, beta_ = np.split(inputs, 2, axis=-1)

            values = plain_beta_value_space.sample()
            values_scaled = values * (high - low) + low
            if fw == "torch":
                values_scaled = torch.Tensor(values_scaled)
            print(values_scaled)
            out = beta_distribution.logp(values_scaled)
            check(
                out,
                np.sum(np.log(beta.pdf(values, alpha, beta_)), -1),
                rtol=0.01)
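For reference, the mean formula used above, `1 / (1 + beta/alpha)`, is just `alpha / (alpha + beta)` rewritten, and the log-likelihood is checked per dimension against `scipy.stats.beta` before scaling from `[0, 1]` to `[low, high]`. A quick standalone check of those two facts (assuming SciPy is available):

# Standalone sanity check of the Beta formulas used in the test above.
import numpy as np
from scipy.stats import beta as beta_dist

a, b = 2.0, 5.0
low, high = -1.0, 2.0

mean01 = 1.0 / (1.0 + b / a)                    # == a / (a + b)
assert np.isclose(mean01, beta_dist.mean(a, b))

scaled_mean = mean01 * (high - low) + low       # mean after rescaling to [low, high]
logp = np.log(beta_dist.pdf(0.3, a, b))         # per-dimension log-likelihood on [0, 1]
print(scaled_mean, logp)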
Example #6
def test_forward() -> None:
    """
    Test forward() when each task-specific output head multiplies the shared trunk
    output by some constant factor, and the task index is included in the input.
    """

    # Set up case.
    dim = SETTINGS["obs_dim"] + SETTINGS["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(SETTINGS["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network and set weights of each output head explicitly. We want to make
    # it so that f_i(x) = x * i + i (with broadcasted operations), where the i-th output
    # head is f_i.
    network = MultiTaskTrunkNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=SETTINGS["num_tasks"],
        num_shared_layers=SETTINGS["num_shared_layers"],
        num_task_layers=SETTINGS["num_task_layers"],
        hidden_size=hidden_size,
        downscale_last_layer=True,
        device=SETTINGS["device"],
    )
    for i in range(SETTINGS["num_tasks"]):

        # Set weights.
        state_dict = network.output_heads[i].state_dict()
        state_dict["0.0.weight"] = torch.Tensor(i * np.identity(dim))
        state_dict["0.0.bias"] = torch.Tensor(i * np.ones(dim))
        network.output_heads[i].load_state_dict(state_dict)

    # Construct batch of observations concatenated with one-hot task vectors.
    obs, task_indices = get_obs_batch(
        batch_size=SETTINGS["num_processes"],
        obs_space=observation_subspace,
        num_tasks=SETTINGS["num_tasks"],
    )

    # Get output of network.
    output = network(obs, task_indices)

    # Construct expected output of network.
    trunk_output = network.trunk(obs)
    expected_output_list = []
    for i, task_index in enumerate(task_indices):
        expected_output_list.append(trunk_output[i] * task_index + task_index)
    expected_output = torch.stack(expected_output_list)

    # Test output of network.
    assert torch.allclose(output, expected_output)
Example #7
def meta_forward_template(
    settings: Dict[str, Any],
    state_dict: Dict[str, torch.Tensor],
    splits_args: List[Dict[str, Any]],
    alpha: List[torch.Tensor],
    get_expected_output: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
) -> None:
    """ Test MetaSplittingNetwork.forward() correct computes network output. """

    # Construct multi-task network.
    multitask_network = BaseMultiTaskSplittingNetwork(
        input_size=settings["input_size"],
        output_size=settings["output_size"],
        num_tasks=settings["num_tasks"],
        num_layers=settings["num_layers"],
        hidden_size=settings["hidden_size"],
        device=settings["device"],
    )

    # Split the network according to `splits_args`.
    for split_args in splits_args:
        multitask_network.split(**split_args)

    # Load state dict.
    multitask_network.load_state_dict(state_dict)

    # Construct MetaSplittingNetwork from BaseMultiTaskSplittingNetwork.
    meta_network = MetaSplittingNetwork(
        multitask_network,
        num_test_tasks=settings["num_tasks"],
        device=settings["device"],
    )

    # Set alpha weights of meta network.
    for layer in range(meta_network.num_layers):
        meta_network.alpha[layer].data = alpha[layer]

    # Construct batch of observations concatenated with one-hot task vectors.
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(settings["obs_dim"], ))
    observation_subspace.seed(settings["seed"])
    obs, task_indices = get_obs_batch(
        batch_size=settings["num_processes"],
        obs_space=observation_subspace,
        num_tasks=settings["num_tasks"],
    )

    # Get and test output of network.
    output = meta_network(obs, task_indices)
    expected_output = get_expected_output(obs, task_indices)
    assert torch.allclose(output, expected_output)
Example #8
def grad_diffs_template(settings: Dict[str, Any], grad_type: str) -> None:
    """
    Test that `get_task_grad_diffs()` correctly computes the pairwise difference between
    task-specific gradients at each region.
    """

    # Set up case.
    dim = settings["obs_dim"] + settings["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(settings["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = BaseMultiTaskSplittingNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=settings["num_tasks"],
        num_layers=settings["num_layers"],
        hidden_size=hidden_size,
        device=settings["device"],
    )

    # Construct dummy task gradients.
    if grad_type == "zero":
        task_grads = torch.zeros(network.num_tasks, network.num_regions,
                                 network.max_region_size)
    elif grad_type == "rand_identical":
        task_grads = torch.rand(1, network.num_regions,
                                network.max_region_size)
        task_grads = task_grads.expand(network.num_tasks, -1, -1)
    elif grad_type == "rand":
        task_grads = torch.rand(network.num_tasks, network.num_regions,
                                network.max_region_size)
    else:
        raise NotImplementedError

    # Compute pairwise differences of task gradients.
    task_grad_diffs = network.get_task_grad_diffs(task_grads)

    # Check computed differences.
    for task1, task2 in product(range(network.num_tasks),
                                range(network.num_tasks)):
        for region in range(network.num_regions):
            expected_diff = torch.sum(
                torch.pow(
                    task_grads[task1, region] - task_grads[task2, region], 2))
            assert torch.allclose(task_grad_diffs[task1, task2, region],
                                  expected_diff)
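The nested loop above spells out what `get_task_grad_diffs()` is expected to return for each task pair and region. The same pairwise squared distances can be written as a single broadcasted expression, which is a convenient reference when reading the test (a sketch, not the library implementation):

# Broadcasted equivalent of the expected values checked in the loop above.
import torch

def pairwise_grad_diffs(task_grads: torch.Tensor) -> torch.Tensor:
    # task_grads: (num_tasks, num_regions, max_region_size)
    diffs = task_grads.unsqueeze(1) - task_grads.unsqueeze(0)  # (T, T, R, S)
    return torch.sum(diffs ** 2, dim=-1)                       # (T, T, R)

print(pairwise_grad_diffs(torch.rand(3, 4, 6)).shape)  # torch.Size([3, 3, 4])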
Example #9
def test_forward_shared() -> None:
    """
    Test forward() when all regions of the splitting network are fully shared. The
    function computed by the network should be f(x) = 3 * tanh(2 * tanh(x + 1) + 2) + 3.
    """

    # Set up case.
    dim = BASE_SETTINGS["obs_dim"] + BASE_SETTINGS["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(BASE_SETTINGS["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = BaseMultiTaskSplittingNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=BASE_SETTINGS["num_tasks"],
        num_layers=BASE_SETTINGS["num_layers"],
        hidden_size=hidden_size,
        device=BASE_SETTINGS["device"],
    )

    # Set network weights.
    state_dict = network.state_dict()
    for i in range(BASE_SETTINGS["num_layers"]):
        weight_name = "regions.%d.0.0.weight" % i
        bias_name = "regions.%d.0.0.bias" % i
        state_dict[weight_name] = torch.Tensor((i + 1) * np.identity(dim))
        state_dict[bias_name] = torch.Tensor((i + 1) * np.ones(dim))
    network.load_state_dict(state_dict)

    # Construct batch of observations concatenated with one-hot task vectors.
    obs, task_indices = get_obs_batch(
        batch_size=BASE_SETTINGS["num_processes"],
        obs_space=observation_subspace,
        num_tasks=BASE_SETTINGS["num_tasks"],
    )

    # Get output of network.
    output = network(obs, task_indices)

    # Compute expected output of network.
    expected_output = 3 * torch.tanh(2 * torch.tanh(obs + 1) + 2) + 3

    # Test output of network.
    assert torch.allclose(output, expected_output)
Example #10
class RandomTeacher(AbstractTeacher):
    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)

        self.random_task_generator = Box(np.array(mins), np.array(maxs), dtype=np.float32)
        self.random_task_generator.seed(self.seed)

    def sample_task(self):
        return self.random_task_generator.sample()

    def non_exploratory_task_sampling(self):
        return {"task": self.sample_task(),
                "infos": {
                    "bk_index": -1,
                    "task_infos": None}
                }
Example #11
def test_split_multiple() -> None:
    """
    Test that split() correctly sets new parameters when we perform multiple splits.
    """

    # Set up case.
    dim = BASE_SETTINGS["obs_dim"] + BASE_SETTINGS["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(BASE_SETTINGS["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = BaseMultiTaskSplittingNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=BASE_SETTINGS["num_tasks"],
        num_layers=BASE_SETTINGS["num_layers"],
        hidden_size=hidden_size,
        device=BASE_SETTINGS["device"],
    )

    # Split the network at the first layer once and the last layer twice.
    network.split(0, 0, [0, 1], [2, 3])
    network.split(2, 0, [0, 2], [1, 3])
    network.split(2, 1, [1], [3])

    # Check the parameters of the network.
    param_names = [name for name, param in network.named_parameters()]

    # Construct expected parameters of network.
    region_copies = {i: [0] for i in range(BASE_SETTINGS["num_layers"])}
    region_copies[0].extend([1])
    region_copies[2].extend([1, 2])
    expected_params = []
    for region, copies in region_copies.items():
        for copy in copies:
            expected_params.append("regions.%d.%d.0.weight" % (region, copy))
            expected_params.append("regions.%d.%d.0.bias" % (region, copy))

    # Test actual parameter names.
    assert set(param_names) == set(expected_params)
Example #12
class RandomTeacher():
    def __init__(self, mins, maxs, seed=None):
        self.seed = seed
        if seed is None:  # don't treat an explicit seed of 0 as "no seed"
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        self.mins = mins
        self.maxs = maxs

        self.random_task_generator = Box(np.array(mins), np.array(maxs), dtype=np.float32)
        self.random_task_generator.seed(self.seed)

    def update(self, task, competence):
        pass

    def sample_task(self):
        return self.random_task_generator.sample()

    def dump(self, dump_dict):
        return dump_dict
Example #13
def test_seed_Dict():
    test_space = Dict(
        {
            "a": Box(low=0, high=1, shape=(3, 3)),
            "b": Dict(
                {
                    "b_1": Box(low=-100, high=100, shape=(2,)),
                    "b_2": Box(low=-1, high=1, shape=(2,)),
                }
            ),
            "c": Discrete(5),
        }
    )

    seed_dict = {
        "a": 0,
        "b": {
            "b_1": 1,
            "b_2": 2,
        },
        "c": 3,
    }

    test_space.seed(seed_dict)

    # "Unpack" the dict sub-spaces into individual spaces
    a = Box(low=0, high=1, shape=(3, 3))
    a.seed(0)
    b_1 = Box(low=-100, high=100, shape=(2,))
    b_1.seed(1)
    b_2 = Box(low=-1, high=1, shape=(2,))
    b_2.seed(2)
    c = Discrete(5)
    c.seed(3)

    for i in range(10):
        test_s = test_space.sample()
        a_s = a.sample()
        assert (test_s["a"] == a_s).all()
        b_1_s = b_1.sample()
        assert (test_s["b"]["b_1"] == b_1_s).all()
        b_2_s = b_2.sample()
        assert (test_s["b"]["b_2"] == b_2_s).all()
        c_s = c.sample()
        assert test_s["c"] == c_s
Example #14
    def test_multi_categorical(self):
        batch_size = 100
        num_categories = 3
        num_sub_distributions = 5
        # Create 5 categorical distributions of 3 categories each.
        inputs_space = Box(-1.0,
                           2.0,
                           shape=(batch_size,
                                  num_sub_distributions * num_categories))
        inputs_space.seed(42)
        values_space = Box(
            0,
            num_categories - 1,
            shape=(num_sub_distributions, batch_size),
            dtype=np.int32,
        )
        values_space.seed(42)

        inputs = inputs_space.sample()
        input_lengths = [num_categories] * num_sub_distributions
        inputs_split = np.split(inputs, num_sub_distributions, axis=1)

        for fw, sess in framework_iterator(session=True):
            # Create the correct distribution object.
            cls = MultiCategorical if fw != "torch" else TorchMultiCategorical
            multi_categorical = cls(inputs, None, input_lengths)

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(
                cls,
                inputs_space.shape,
                fw=fw,
                sess=sess,
                bounds=(0, num_categories - 1),
                extra_kwargs={"input_lens": input_lengths},
            )

            # Batch of size=batch_size and deterministic (True).
            expected = np.transpose(np.argmax(inputs_split, axis=-1))
            # Sample, expect always max value
            # (max likelihood for deterministic draw).
            out = multi_categorical.deterministic_sample()
            check(out, expected)

            # Batch of size=batch_size and non-deterministic -> expect roughly the mean.
            out = multi_categorical.sample()
            check(
                tf.reduce_mean(out)
                if fw != "torch" else torch.mean(out.float()),
                1.0,
                decimals=0,
            )

            # Test log-likelihood outputs.
            probs = softmax(inputs_split)
            values = values_space.sample()

            out = multi_categorical.logp(values if fw != "torch" else [
                torch.Tensor(values[i]) for i in range(num_sub_distributions)
            ])  # v in np.stack(values, 1)])
            expected = []
            for i in range(batch_size):
                expected.append(
                    np.sum(
                        np.log(
                            np.array([
                                probs[j][i][values[j][i]]
                                for j in range(num_sub_distributions)
                            ]))))
            check(out, expected, decimals=4)

            # Test entropy outputs.
            out = multi_categorical.entropy()
            expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1)
            check(out, expected_entropy)
Example #15
class ALPGMM(AbstractTeacher):
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 gmm_fitness_func="aic",
                 warm_start=False,
                 nb_em_init=1,
                 fit_rate=250,
                 alp_max_size=None,
                 alp_buffer_size=500,
                 potential_ks=np.arange(2, 11, 1),
                 random_task_ratio=0.2,
                 nb_bootstrap=None,
                 initial_dist=None):
        '''
            Absolute Learning Progress - Gaussian Mixture Model (https://arxiv.org/abs/1910.07224).

            Args:
                gmm_fitness_func: Fitness criterion when selecting the best GMM among a range of GMMs varying in number of Gaussians.
                warm_start: Restart each new fit by initializing with the last fit
                nb_em_init: Number of Expectation-Maximization trials when fitting
                fit_rate: Number of episodes between two fits of the GMM
                alp_max_size: Maximum number of episodes stored
                alp_buffer_size: Maximal number of episodes to account for when computing ALP
                potential_ks: Range of number of Gaussians to try when fitting the GMM
                random_task_ratio: Ratio of randomly sampled tasks vs. tasks sampled using the GMM
                nb_bootstrap: Number of bootstrapping episodes, must be >= fit_rate
                initial_dist: Initial Gaussian distribution. If None, bootstrap with random tasks
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)

        # Range of number of Gaussians to try when fitting the GMM
        self.potential_ks = potential_ks
        # Restart new fit by initializing with last fit
        self.warm_start = warm_start
        # Fitness criterion when selecting best GMM among range of GMMs varying in number of Gaussians.
        self.gmm_fitness_func = gmm_fitness_func
        # Number of Expectation-Maximization trials when fitting
        self.nb_em_init = nb_em_init
        # Number of episodes between two fits of the GMM
        self.fit_rate = fit_rate
        self.nb_bootstrap = nb_bootstrap if nb_bootstrap is not None else fit_rate  # Number of bootstrapping episodes, must be >= fit_rate
        self.initial_dist = initial_dist  # Initial Gaussian distribution. If None, bootstrap with random tasks

        # Ratio of randomly sampled tasks vs. tasks sampled using the GMM
        self.random_task_ratio = random_task_ratio
        self.random_task_generator = Box(self.mins,
                                         self.maxs,
                                         dtype=np.float32)
        self.random_task_generator.seed(self.seed)

        # Init ALP computer. alp_max_size bounds how many episodes are stored and
        # alp_buffer_size bounds how many episodes are used when computing ALP.
        self.alp_computer = EmpiricalALPComputer(len(mins),
                                                 max_size=alp_max_size,
                                                 buffer_size=alp_buffer_size)

        self.tasks = []
        self.alps = []
        self.tasks_alps = []

        # Init GMMs
        self.potential_gmms = [self.init_gmm(k) for k in self.potential_ks]
        self.gmm = None

        # Boring book-keeping
        self.bk = {
            'weights': [],
            'covariances': [],
            'means': [],
            'tasks_alps': [],
            'tasks_lps': [],
            'episodes': [],
            'tasks_origin': []
        }

    def init_gmm(self, nb_gaussians):
        '''
            Init the GMM given the number of gaussians.
        '''
        return GMM(n_components=nb_gaussians,
                   covariance_type='full',
                   random_state=self.seed,
                   warm_start=self.warm_start,
                   n_init=self.nb_em_init)

    def get_nb_gmm_params(self, gmm):
        '''
            Assumes full covariance.
            See https://stats.stackexchange.com/questions/229293/the-number-of-parameters-in-gaussian-mixture-model
        '''
        nb_gmms = gmm.get_params()['n_components']
        d = len(self.mins)
        params_per_gmm = (d * d - d) / 2 + 2 * d + 1
        return nb_gmms * params_per_gmm - 1

    def episodic_update(self, task, reward, is_success):
        self.tasks.append(task)

        is_update_time = False

        # Compute corresponding ALP
        alp, lp = self.alp_computer.compute_alp(task, reward)
        self.alps.append(alp)

        # Concatenate task vector with ALP dimension
        self.tasks_alps.append(np.array(task.tolist() + [self.alps[-1]]))

        if len(self.tasks
               ) >= self.nb_bootstrap:  # If initial bootstrapping is done
            if (len(self.tasks) % self.fit_rate) == 0:  # Time to fit
                is_update_time = True
                # 1 - Retrieve last <fit_rate> (task, reward) pairs
                cur_tasks_alps = np.array(self.tasks_alps[-self.fit_rate:])

                # 2 - Fit batch of GMMs with varying number of Gaussians
                self.potential_gmms = [
                    g.fit(cur_tasks_alps) for g in self.potential_gmms
                ]

                # 3 - Compute fitness and keep best GMM
                fitnesses = []
                if self.gmm_fitness_func == 'bic':  # Bayesian Information Criterion
                    fitnesses = [
                        m.bic(cur_tasks_alps) for m in self.potential_gmms
                    ]
                elif self.gmm_fitness_func == 'aic':  # Akaike Information Criterion
                    fitnesses = [
                        m.aic(cur_tasks_alps) for m in self.potential_gmms
                    ]
                elif self.gmm_fitness_func == 'aicc':  # Modified AIC
                    n = self.fit_rate
                    fitnesses = []
                    for l, m in enumerate(self.potential_gmms):
                        k = self.get_nb_gmm_params(m)
                        penalty = (2 * k * (k + 1)) / (n - k - 1)
                        fitnesses.append(m.aic(cur_tasks_alps) + penalty)
                else:
                    raise NotImplementedError
                self.gmm = self.potential_gmms[np.argmin(fitnesses)]

                # book-keeping
                self.bk['weights'].append(self.gmm.weights_.copy())
                self.bk['covariances'].append(self.gmm.covariances_.copy())
                self.bk['means'].append(self.gmm.means_.copy())
                self.bk['tasks_alps'] = self.tasks_alps
                self.bk['tasks_lps'].append(lp)
                self.bk['episodes'].append(len(self.tasks))
        return is_update_time

    def sample_task(self):
        task_origin = None
        if len(self.tasks) < self.nb_bootstrap or self.random_state.random(
        ) < self.random_task_ratio or self.gmm is None:
            if self.initial_dist and len(
                    self.tasks
            ) < self.nb_bootstrap:  # bootstrap in initial dist
                # Expert bootstrap Gaussian task sampling
                new_task = self.random_state.multivariate_normal(
                    self.initial_dist['mean'], self.initial_dist['variance'])
                new_task = np.clip(new_task, self.mins,
                                   self.maxs).astype(np.float32)
                task_origin = -2  # -2 = task originates from initial bootstrap gaussian sampling
            else:
                # Random task sampling
                new_task = self.random_task_generator.sample()
                task_origin = -1  # -1 = task originates from random sampling
        else:
            # ALP-based task sampling
            # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
            self.alp_means = []
            for pos, _, w in zip(self.gmm.means_, self.gmm.covariances_,
                                 self.gmm.weights_):
                self.alp_means.append(pos[-1])

            # 2 - Sample Gaussian proportionally to its mean ALP
            idx = proportional_choice(self.alp_means,
                                      self.random_state,
                                      eps=0.0)
            task_origin = idx

            # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
            new_task = self.random_state.multivariate_normal(
                self.gmm.means_[idx], self.gmm.covariances_[idx])[:-1]
            new_task = np.clip(new_task, self.mins,
                               self.maxs).astype(np.float32)

        # boring book-keeping
        self.bk['tasks_origin'].append(task_origin)
        return new_task

    def is_non_exploratory_task_sampling_available(self):
        return self.gmm is not None

    def non_exploratory_task_sampling(self):
        # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
        alp_means = []
        for pos, _, w in zip(self.gmm.means_, self.gmm.covariances_,
                             self.gmm.weights_):
            alp_means.append(pos[-1])

        # 2 - Sample Gaussian proportionally to its mean ALP
        idx = proportional_choice(alp_means, self.random_state, eps=0.0)

        # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
        new_task = self.random_state.multivariate_normal(
            self.gmm.means_[idx], self.gmm.covariances_[idx])[:-1]
        new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32)
        return {
            "task": new_task,
            "infos": {
                "bk_index": len(self.bk[list(self.bk.keys())[0]]) - 1,
                "task_infos": idx
            }
        }
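The ALPGMM teacher is driven episodically: the trainer asks for a task, runs an episode on it, and reports the episodic reward back so the GMM is refit every `fit_rate` episodes. A minimal driving loop might look like the sketch below, where `run_episode` is a hypothetical stand-in for however the surrounding project trains and evaluates on a sampled task:

# Hypothetical usage sketch for the ALPGMM teacher; run_episode() is a placeholder.
import numpy as np

def run_episode(task):
    # Placeholder: pretend episodic return depends on the task vector.
    return float(-np.sum(task))

teacher = ALPGMM(mins=np.array([0.0, 0.0]), maxs=np.array([1.0, 1.0]), seed=42,
                 env_reward_lb=-100.0, env_reward_ub=100.0, fit_rate=250)

for episode in range(1000):
    task = teacher.sample_task()                              # numpy vector inside [mins, maxs]
    reward = run_episode(task)
    refit = teacher.episodic_update(np.asarray(task), reward, is_success=False)
    # `refit` is True on the episodes where the batch of GMMs was refit.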
Example #16
def backward_template(settings: Dict[str, Any],
                      splits_args: List[Dict[str, Any]]) -> None:
    """
    Template to test that the backward() function correctly computes gradients. We don't
    actually compare the gradients against baseline values; instead, we just check that
    the gradient of the loss for task i is non-zero for all copies that i is assigned
    to, and zero for all copies i isn't assigned to, for each i. To keep things simple,
    we define each task loss as the squared norm of the output for inputs from the given
    task.
    """

    # Set up case.
    dim = settings["obs_dim"] + settings["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(settings["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = BaseMultiTaskSplittingNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=settings["num_tasks"],
        num_layers=settings["num_layers"],
        hidden_size=hidden_size,
        device=settings["device"],
    )

    # Split the network according to `splits_args`.
    for split_args in splits_args:
        network.split(**split_args)

    # Re-initialize the new copies so different tasks will actually have different
    # corresponding functions.
    state_dict = network.state_dict()
    for region in range(network.num_regions):
        for copy in range(1, int(network.splitting_map.num_copies[region])):
            weight_name = "regions.%d.%d.0.weight" % (region, copy)
            bias_name = "regions.%d.%d.0.bias" % (region, copy)
            state_dict[weight_name] = torch.rand(state_dict[weight_name].shape)
            state_dict[bias_name] = torch.rand(state_dict[bias_name].shape)
    network.load_state_dict(state_dict)

    # Construct batch of observations concatenated with one-hot task vectors.
    obs, task_indices = get_obs_batch(
        batch_size=settings["num_processes"],
        obs_space=observation_subspace,
        num_tasks=settings["num_tasks"],
    )

    # Get output of network and compute task losses.
    output = network(obs, task_indices)
    task_losses = {i: None for i in range(settings["num_tasks"])}
    for task in range(settings["num_tasks"]):
        for current_out, current_task in zip(output, task_indices):
            if current_task == task:
                if task_losses[task] is not None:
                    task_losses[task] += torch.sum(current_out**2)
                else:
                    task_losses[task] = torch.sum(current_out**2)

    # Test gradients.
    for task in range(settings["num_tasks"]):
        network.zero_grad()
        if task_losses[task] is None:
            continue

        task_losses[task].backward(retain_graph=True)
        for region in range(len(network.regions)):
            for copy in range(int(network.splitting_map.num_copies[region])):
                for param in network.regions[region][copy].parameters():
                    zero = torch.zeros(param.grad.shape)
                    if network.splitting_map.copy[region, task] == copy:
                        assert not torch.allclose(param.grad, zero)
                    else:
                        assert torch.allclose(param.grad, zero)
Example #17
def test_actor_worker(config: Dict, seeds: Dict[str, np.random.SeedSequence]):
    """
  Function to run an isolated actor workers and manually test it out in the cli.
  """
    from gym.spaces import Box, Discrete
    from asrel.core.utils import get_actor_args_from_config, take_tensor_from_dict
    from asrel.core.workers.actor import ActorWorker
    import asrel.core.workers.events as events

    actor_args = get_actor_args_from_config(config["actor"])

    print(f"Testing Actor Worker with args: {actor_args}")

    num_workers = actor_args.get("num_workers", 1)
    input_queue_len = 8
    input_space = Box(-10, 10, (6, ), np.float32)

    input_space.seed(0)
    output_space = Discrete(3)

    print("Creating workers...")

    actor_input_queues = [
        mp.Queue(maxsize=input_queue_len) for _ in range(num_workers)
    ]
    actor_shared_output_queue = mp.Queue(maxsize=num_workers * input_queue_len)
    actor_worker_seed_seqs = seeds["actor"].spawn(num_workers)

    actor_workers = [
        ActorWorker(
            input_queue=actor_input_queues[idx],
            output_queue=actor_shared_output_queue,
            seed_seq=actor_worker_seed_seqs[idx],
            input_space=input_space,
            output_space=output_space,
            index=idx,
            **actor_args,
        ) for idx in range(num_workers)
    ]

    for worker in actor_workers:
        worker.start()

    try:
        while True:
            task = int(
                input(
                    "0 - Choose Action, 1 - Sync Networks, 2 - Update Params: "
                ))
            if task == 0:
                worker_idx = int(input(" worker: ", ))

                num_obs = int(input(" # of obs: "))
                obs = torch.tensor(
                    [input_space.sample() for _ in range(num_obs)]).cuda()
                print(f" obs:\n{obs}")
                env_worker_idx = int(input(" env worker:   "))
                env_sub_idx = int(input("     subenv:   "))
                greedy = input(" greedy (y/n): ").lower() == "y"
                actor_input_queues[worker_idx].put({
                    "type": events.ACTOR_CHOOSE_ACTION_TASK,
                    "observation": obs,
                    "greedy": greedy,
                    "env_idx": (env_worker_idx, env_sub_idx),
                })
                out = actor_shared_output_queue.get()
                out_action = take_tensor_from_dict(out, "action")
                print(f"worker {worker_idx}:")
                print({**out, "action": out_action})

            elif task == 1:
                state_dicts = json.loads(input("State Dictionaries: "))
                for q in actor_input_queues:
                    q.put({
                        "type": events.ACTOR_SYNC_NETWORKS_TASK,
                        "state_dicts": state_dicts,
                    })
            elif task == 2:
                params = json.loads(input("Params: "))
                for q in actor_input_queues:
                    q.put({
                        "type": events.ACTOR_UPDATE_PARAMS_TASK,
                        **params,
                    })

    except (KeyboardInterrupt, Exception) as e:
        print()
        print("Terminating workers...")
        for worker in actor_workers:
            worker.terminate()
        print(e)
    else:
        print("Closing worker...")
        for worker in actor_workers:
            worker.close()

    for worker in actor_workers:
        worker.join()
Example #18
class ContinuousGridEnv(gym.Env):
    def __init__(self,
                 r=None,
                 size_x=4,
                 size_y=4,
                 T=50,
                 random_born=False,
                 state_indices=None,
                 random_act_prob=0.0,
                 sigma=1.0,
                 terminal_states=[],
                 seed=0,
                 add_time=False,
                 **kwargs):
        self.size_x = size_x
        self.size_y = size_y
        self.terminal_states = terminal_states
        self.r = r
        self.range_x = (0, size_x)
        self.range_y = (0, size_y)
        self.random_act_prob = random_act_prob
        self.sigma = sigma
        self.state_indices = state_indices
        self.T = T

        self.observation_space = Box(low=np.array([0, 0]),
                                     high=np.array([size_x, size_y]),
                                     dtype=np.float32)
        self.action_space = Box(low=np.array([-1, -1]),
                                high=np.array([1, 1]),
                                dtype=np.float32)

        self.seed(seed)
        self.action_space.seed(seed)
        self.random_born = random_born

    def set_reward_function(self, r):
        self.r = r

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, n=1):
        if self.random_born:
            self.s = np.random.uniform((0, 0), (self.size_x, self.size_y),
                                       size=(n, 2))
        else:
            self.s = np.zeros((n, 2), dtype=np.float32)

        self.n = n
        self.t = 0
        return self.s.copy()

    def step(self, action):
        change_action_prob = (np.random.uniform(0, 1, size=(self.n)) <
                              self.random_act_prob).reshape(-1, 1)
        action = change_action_prob * (action + self.sigma * np.random.randn(self.n, 2)) \
                + (1-change_action_prob) * action
        self.s += action
        self.s[:, 0] = np.clip(self.s[:, 0], 0, self.size_x)
        self.s[:, 1] = np.clip(self.s[:, 1], 0, self.size_y)
        self.t += 1
        done = (self.t >= self.T)
        if self.r is None:  # for adv IRL
            r = np.zeros((self.n, ))
        else:  # for SMM IRL
            r = self.r(self.s)

        return self.s.copy(), r, done, {}  # info dict (the gym API expects a dict)
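Note that `reset(n=...)` returns a batch of `n` 2-D states and `step()` expects a matching `(n, 2)` action array; with the default `r=None` the per-step reward is an all-zero vector of length `n`. A minimal vectorized rollout sketch (illustrative only, assuming the class and its imports are in scope):

# Illustrative vectorized rollout for ContinuousGridEnv; not part of the original file.
env = ContinuousGridEnv(size_x=4, size_y=4, T=10)
states = env.reset(n=5)                                     # (5, 2) batch of start states
done = False
while not done:
    actions = np.random.uniform(-1, 1, size=states.shape)   # one action per state
    states, rewards, done, _ = env.step(actions)
print(states.shape, rewards.shape)                          # (5, 2) (5,)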
Example #19
def test_forward_single() -> None:
    """
    Test forward() when all regions of the splitting network are fully shared except
    one. The function computed by the network should be f(x) = 3 * tanh(2 * tanh(x + 1)
    + 2) + 3 for tasks 0 and 1 and f(x) = 3 * tanh(-2 * tanh(x + 1) - 2) + 3 for tasks 2
    and 3.
    """

    # Set up case.
    dim = BASE_SETTINGS["obs_dim"] + BASE_SETTINGS["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(BASE_SETTINGS["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = BaseMultiTaskSplittingNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=BASE_SETTINGS["num_tasks"],
        num_layers=BASE_SETTINGS["num_layers"],
        hidden_size=hidden_size,
        device=BASE_SETTINGS["device"],
    )

    # Split the network at the second layer. Tasks 0 and 1 stay assigned to the original
    # copy and tasks 2 and 3 are assigned to the new copy.
    network.split(1, 0, [0, 1], [2, 3])

    # Set network weights.
    state_dict = network.state_dict()
    for i in range(BASE_SETTINGS["num_layers"]):
        weight_name = "regions.%d.0.0.weight" % i
        bias_name = "regions.%d.0.0.bias" % i
        state_dict[weight_name] = torch.Tensor((i + 1) * np.identity(dim))
        state_dict[bias_name] = torch.Tensor((i + 1) * np.ones(dim))
    weight_name = "regions.1.1.0.weight"
    bias_name = "regions.1.1.0.bias"
    state_dict[weight_name] = torch.Tensor(-2 * np.identity(dim))
    state_dict[bias_name] = torch.Tensor(-2 * np.ones(dim))
    network.load_state_dict(state_dict)

    # Construct batch of observations concatenated with one-hot task vectors.
    obs, task_indices = get_obs_batch(
        batch_size=BASE_SETTINGS["num_processes"],
        obs_space=observation_subspace,
        num_tasks=BASE_SETTINGS["num_tasks"],
    )

    # Get output of network.
    output = network(obs, task_indices)

    # Compute expected output of network.
    expected_output = torch.zeros(obs.shape)
    for i, (ob, task) in enumerate(zip(obs, task_indices)):
        if task in [0, 1]:
            expected_output[i] = 3 * torch.tanh(2 * torch.tanh(ob + 1) + 2) + 3
        elif task in [2, 3]:
            expected_output[i] = 3 * torch.tanh(-2 * torch.tanh(ob + 1) -
                                                2) + 3
        else:
            raise NotImplementedError

    # Test output of network.
    assert torch.allclose(output, expected_output)
Example #20
def gradients_template(settings: Dict[str, Any],
                       splits_args: List[Dict[str, Any]]) -> None:
    """
    Template to test that `get_task_grads()` correctly computes task-specific gradients
    at each region of the network. For simplicity we compute the loss as half of the
    squared norm of the output, and we make the following assumptions: each layer has
    the same size, the activation function is Tanh for each layer, and the final layer
    has no activation.
    """

    # Set up case.
    dim = settings["obs_dim"] + settings["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(settings["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = BaseMultiTaskSplittingNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=settings["num_tasks"],
        num_layers=settings["num_layers"],
        hidden_size=hidden_size,
        device=settings["device"],
    )

    # Split the network according to `splits_args`.
    for split_args in splits_args:
        network.split(**split_args)

    # Re-initialize the new copies so different tasks will actually have different
    # corresponding functions.
    state_dict = network.state_dict()
    for region in range(network.num_regions):
        for copy in range(1, int(network.splitting_map.num_copies[region])):
            weight_name = "regions.%d.%d.0.weight" % (region, copy)
            bias_name = "regions.%d.%d.0.bias" % (region, copy)
            state_dict[weight_name] = torch.rand(state_dict[weight_name].shape)
            state_dict[bias_name] = torch.rand(state_dict[bias_name].shape)
    network.load_state_dict(state_dict)

    # Register forward hooks to get activations later from each copy of each region.
    activation = {}

    def get_activation(name):
        def hook(model, ins, outs):
            activation[name] = outs.detach()

        return hook

    for region in range(network.num_regions):
        for copy in range(int(network.splitting_map.num_copies[region])):
            name = "regions.%d.%d" % (region, copy)
            network.regions[region][copy].register_forward_hook(
                get_activation(name))

    # Construct batch of observations concatenated with one-hot task vectors.
    obs, task_indices = get_obs_batch(
        batch_size=settings["num_processes"],
        obs_space=observation_subspace,
        num_tasks=settings["num_tasks"],
    )

    # Get output of network and compute task gradients.
    output = network(obs, task_indices)
    task_losses = torch.zeros(settings["num_tasks"])
    for task in range(settings["num_tasks"]):
        for current_out, current_task in zip(output, task_indices):
            if current_task == task:
                task_losses[task] += 0.5 * torch.sum(current_out**2)

    task_grads = network.get_task_grads(task_losses)

    def get_task_activations(r, t, tasks):
        """ Helper function to get activations from specific regions. """

        c = network.splitting_map.copy[r, t]
        copy_indices = network.splitting_map.copy[r, tasks]
        sorted_copy_indices, copy_permutation = torch.sort(copy_indices)
        sorted_tasks = tasks[copy_permutation]
        batch_indices = (sorted_copy_indices == c).nonzero().squeeze(-1)
        task_batch_indices = sorted_tasks[batch_indices]
        current_task_indices = (task_batch_indices == t).nonzero().squeeze(-1)
        activations = activation["regions.%d.%d" %
                                 (r, c)][current_task_indices]

        return activations

    # Compute expected gradients.
    state_dict = network.state_dict()
    expected_task_grads = torch.zeros(
        (settings["num_tasks"], network.num_regions, network.max_region_size))
    for task in range(settings["num_tasks"]):

        # Get output from current task.
        task_input_indices = (task_indices == task).nonzero().squeeze(-1)
        task_output = output[task_input_indices]

        # Clear local gradients.
        local_grad = {}

        for region in reversed(range(network.num_regions)):

            # Get copy index and layer input.
            copy = network.splitting_map.copy[region, task]
            if region > 0:
                layer_input = get_task_activations(region - 1, task,
                                                   task_indices)
            else:
                layer_input = obs[task_input_indices]

            # Compute local gradient first.
            if region == network.num_regions - 1:
                local_grad[region] = -task_output
            else:
                layer_output = get_task_activations(region, task, task_indices)
                local_grad[region] = torch.zeros(len(layer_output), dim)
                next_copy = network.splitting_map.copy[region + 1, task]
                weights = state_dict["regions.%d.%d.0.weight" %
                                     (region + 1, next_copy)]
                for i in range(dim):
                    for j in range(dim):
                        local_grad[region][:, i] += (
                            local_grad[region + 1][:, j] * weights[j, i])
                local_grad[region] = local_grad[region] * (1 - layer_output**2)

            # Compute gradient from local gradients.
            grad = torch.zeros(dim, dim + 1)
            for i in range(dim):
                for j in range(dim):
                    grad[i, j] = torch.sum(-local_grad[region][:, i] *
                                           layer_input[:, j])
                grad[i, dim] = torch.sum(-local_grad[region][:, i])

            # Rearrange weights and biases. Should be all weights, then all biases.
            weights = torch.reshape(grad[:, :-1], (-1, ))
            biases = torch.reshape(grad[:, -1], (-1, ))
            grad = torch.cat([weights, biases])
            expected_task_grads[task, region, :len(grad)] = grad

    # Test gradients.
    assert torch.allclose(task_grads, expected_task_grads, atol=2e-5)
Example #21
def meta_backward_template(
    settings: Dict[str, Any],
    splits_args: List[Dict[str, Any]],
    alpha: List[torch.Tensor],
) -> None:
    """
    Template to test that the backward() function correctly computes gradients. We don't
    actually compare the gradients against baseline values; instead, we just check that
    the gradients are non-zero for each of the alpha values and zero for the parameters
    in each region.
    """

    # Construct multi-task network.
    multitask_network = BaseMultiTaskSplittingNetwork(
        input_size=settings["input_size"],
        output_size=settings["output_size"],
        num_tasks=settings["num_tasks"],
        num_layers=settings["num_layers"],
        hidden_size=settings["hidden_size"],
        device=settings["device"],
    )

    # Split the network according to `splits_args`.
    for split_args in splits_args:
        multitask_network.split(**split_args)

    # Construct MetaSplittingNetwork from BaseMultiTaskSplittingNetwork.
    meta_network = MetaSplittingNetwork(
        multitask_network,
        num_test_tasks=settings["num_tasks"],
        device=settings["device"],
    )

    # Set alpha weights of meta network.
    for layer in range(meta_network.num_layers):
        meta_network.alpha[layer].data = alpha[layer]

    # Construct batch of observations concatenated with one-hot task vectors.
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(settings["obs_dim"], ))
    observation_subspace.seed(settings["seed"])
    obs, task_indices = get_obs_batch(
        batch_size=settings["num_processes"],
        obs_space=observation_subspace,
        num_tasks=settings["num_tasks"],
    )

    # Get output, compute a dummy loss, and perform backwards call.
    output = meta_network(obs, task_indices)
    loss = torch.sum(output**2)
    meta_network.zero_grad()
    loss.backward()

    # Check that gradients of alpha values are non-zero.
    batch_tasks = task_indices.tolist()
    for layer in range(meta_network.num_layers):
        for task in range(meta_network.num_test_tasks):
            grad = meta_network.alpha[layer].grad[:, task]
            assert grad is not None
            if task in batch_tasks:
                assert torch.all(grad != 0)
            else:
                assert torch.all(grad == 0)

    # Check that gradients of regions are zero.
    for region in range(meta_network.num_regions):
        for copy in range(int(meta_network.splitting_map.num_copies[region])):
            for param in meta_network.regions[region][copy].parameters():
                assert param.grad is None
Example #22
class ADR(AbstractTeacher):
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 step_size,
                 max_reward_thr,
                 min_reward_thr,
                 initial_dist=None,
                 boundary_sampling_p=0.5,
                 queue_len=10,
                 scale_reward=False):
        '''
            Automatic Domain Randomization (https://arxiv.org/abs/1910.07113).

            Args:
                step_size: Size of the growth (or decrease) of a bound at update
                max_reward_thr: Upper reward threshold used to inflate distribution
                min_reward_thr: Lower reward threshold used to deflate distribution
                initial_dist: The mean of this initial distribution is used as the initial task used by ADR
                boundary_sampling_p: Probability to sample a dimension at a bound
                queue_len: Size of the queue associated with each bound. Once full, ADR increases or decreases the bound.
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)
        self.nb_dims = len(self.mins)

        # Boundary sampling probability p_r
        self.bound_sampling_p = boundary_sampling_p

        # ADR step size
        self.step_size = step_size

        # Max reward threshold, sampling distribution inflates if mean reward above this
        self.max_reward_threshold = max_reward_thr
        if scale_reward:
            self.max_reward_threshold = np.interp(
                self.max_reward_threshold,
                (self.env_reward_lb, self.env_reward_ub), (0, 1))

        # Min reward threshold, sampling distribution deflates if mean reward below this
        self.min_reward_threshold = min_reward_thr
        if scale_reward:
            self.min_reward_threshold = np.interp(
                self.min_reward_threshold,
                (self.env_reward_lb, self.env_reward_ub), (0, 1))

        # max queue length
        self.window_len = queue_len

        # Set initial task space to predefined calibrated task
        initial_mean, initial_variance = self.get_or_create_dist(initial_dist,
                                                                 mins,
                                                                 maxs,
                                                                 subspace=True)

        # Single task version (as in the original paper): start from a point distribution
        self.cur_mins = np.array(initial_mean, dtype=np.float32)  # current min bounds
        self.cur_maxs = np.array(initial_mean, dtype=np.float32)  # current max bounds
        self.task_space = Box(self.cur_mins, self.cur_maxs, dtype=np.float32)
        self.task_space.seed(self.seed)

        # Init queues, one per task space dimension
        self.min_queues = [
            deque(maxlen=self.window_len) for _ in range(self.nb_dims)
        ]
        self.max_queues = [
            deque(maxlen=self.window_len) for _ in range(self.nb_dims)
        ]

        # Boring book-keeping
        self.episode_nb = 0
        self.bk = {
            'task_space': [(self.cur_mins.copy(), self.cur_maxs.copy())],
            'episodes': []
        }

    def episodic_update(self, task, reward, is_success):
        self.episode_nb += 1

        # check for updates
        for i, (min_q, max_q, cur_min, cur_max) in enumerate(
                zip(self.min_queues, self.max_queues, self.cur_mins,
                    self.cur_maxs)):
            if task[i] == cur_min:  # proposed task has its i-th dimension at the min boundary
                min_q.append(reward)
                if len(min_q) == self.window_len:  # queue is full, time to update
                    if np.mean(min_q) >= self.max_reward_threshold:
                        # decrease min boundary (inflate sampling space)
                        self.cur_mins[i] = max(
                            self.cur_mins[i] - self.step_size, self.mins[i])
                    elif np.mean(min_q) <= self.min_reward_threshold:
                        # increase min boundary (deflate sampling space)
                        self.cur_mins[i] = min(
                            self.cur_mins[i] + self.step_size, self.cur_maxs[i])
                    self.min_queues[i] = deque(maxlen=self.window_len)  # reset queue
            if task[i] == cur_max:  # proposed task has its i-th dimension at the max boundary
                max_q.append(reward)
                if len(max_q) == self.window_len:  # queue is full, time to update
                    if np.mean(max_q) >= self.max_reward_threshold:
                        # increase max boundary (inflate sampling space)
                        self.cur_maxs[i] = min(
                            self.cur_maxs[i] + self.step_size, self.maxs[i])
                    elif np.mean(max_q) <= self.min_reward_threshold:
                        # decrease max boundary (deflate sampling space)
                        self.cur_maxs[i] = max(
                            self.cur_maxs[i] - self.step_size, self.cur_mins[i])
                    self.max_queues[i] = deque(maxlen=self.window_len)  # reset queue

        prev_cur_mins, prev_cur_maxs = self.bk['task_space'][-1]
        if ((prev_cur_mins != self.cur_mins).any()
                or (prev_cur_maxs != self.cur_maxs).any()):  # were boundaries changed?
            self.task_space = Box(self.cur_mins,
                                  self.cur_maxs,
                                  dtype=np.float32)
            self.task_space.seed(self.seed)
            # book-keeping only if boundaries were updated
            self.bk['task_space'].append(
                (self.cur_mins.copy(), self.cur_maxs.copy()))
            self.bk['episodes'].append(self.episode_nb)

    def sample_task(self):
        new_task = self.non_exploratory_task_sampling()["task"]
        if self.random_state.random() < self.bound_sampling_p:
            # set a random dimension to its current min or max bound
            idx = self.random_state.randint(0, self.nb_dims)
            is_min_max_capped = np.array([
                self.cur_mins[idx] == self.mins[idx],
                self.cur_maxs[idx] == self.maxs[idx]
            ])
            if not is_min_max_capped.any():
                # neither bound is capped at the task space limits, choose an extremum at random
                if self.random_state.random() < 0.5:
                    new_task[idx] = self.cur_mins[idx]
                else:
                    new_task[idx] = self.cur_maxs[idx]
            elif not is_min_max_capped[0]:  # only the min bound can still move
                new_task[idx] = self.cur_mins[idx]
            elif not is_min_max_capped[1]:  # only the max bound can still move
                new_task[idx] = self.cur_maxs[idx]
        return new_task

    def non_exploratory_task_sampling(self):
        return {
            "task": self.task_space.sample(),
            "infos": {
                "bk_index": len(self.bk[list(self.bk.keys())[0]]) - 1,
                "task_infos": None
            }
        }
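
A minimal usage sketch of the episodic loop this teacher expects. The bounds, thresholds, reward values and the run_episode helper are hypothetical; only ADR and its constructor signature come from the code above, and AbstractTeacher.get_or_create_dist is assumed to fall back to a default distribution when initial_dist is None.

teacher = ADR(mins=[0.0, 0.0], maxs=[1.0, 1.0], seed=0,
              env_reward_lb=-100.0, env_reward_ub=100.0,
              step_size=0.05, max_reward_thr=50.0, min_reward_thr=10.0)
for _ in range(10000):
    task = teacher.sample_task()             # may be pinned to a current boundary
    reward = run_episode(env, task)          # hypothetical: episodic return on this task
    teacher.episodic_update(task, reward, is_success=None)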
Example #23
class ALPGMM():
    def __init__(self, mins, maxs, seed=None, params=dict()):
        self.seed = seed
        if not seed:
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        # Task space boundaries
        self.mins = np.array(mins)
        self.maxs = np.array(maxs)

        # Range of number of Gaussians to try when fitting the GMM
        self.potential_ks = params.get("potential_ks", np.arange(2, 11, 1))
        # Restart each new fit by initializing with the last fit
        self.warm_start = params.get("warm_start", False)
        # Fitness criterion used to select the best GMM among candidates varying in number of Gaussians
        self.gmm_fitness_fun = params.get("gmm_fitness_fun", "aic")
        # Number of Expectation-Maximization trials when fitting
        self.nb_em_init = params.get("nb_em_init", 1)
        # Number of episodes between two fits of the GMM
        self.fit_rate = params.get("fit_rate", 250)
        self.nb_random = self.fit_rate  # Number of bootstrapping episodes

        # Ratio of randomly sampled tasks vs. tasks sampled using the GMM
        self.random_task_ratio = params.get("random_task_ratio", 0.2)
        self.random_task_generator = Box(self.mins, self.maxs, dtype=np.float32)
        self.random_task_generator.seed(self.seed)

        # Maximal number of episodes to account for when computing ALP
        alp_max_size = params.get("alp_max_size", None)
        alp_buffer_size = params.get("alp_buffer_size", 500)

        # Init ALP computer
        self.alp_computer = EmpiricalALPComputer(len(mins),
                                                 max_size=alp_max_size,
                                                 buffer_size=alp_buffer_size)

        self.tasks = []
        self.alps = []
        self.tasks_alps = []

        # Init GMMs
        self.potential_gmms = [self.init_gmm(k) for k in self.potential_ks]
        self.gmm = None

        # Boring book-keeping
        self.bk = {
            'weights': [],
            'covariances': [],
            'means': [],
            'tasks_alps': [],
            'tasks_lps': [],
            'episodes': [],
            'tasks_origin': []
        }

    def init_gmm(self, nb_gaussians):
        return GMM(n_components=nb_gaussians,
                   covariance_type='full',
                   random_state=self.seed,
                   warm_start=self.warm_start,
                   n_init=self.nb_em_init)

    def get_nb_gmm_params(self, gmm):
        # assumes full covariance
        # see https://stats.stackexchange.com/questions/229293/the-number-of-parameters-in-gaussian-mixture-model
        nb_gmms = gmm.get_params()['n_components']
        d = len(self.mins)
        params_per_gmm = (d * d - d) / 2 + 2 * d + 1
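        # Per component: d*(d-1)/2 off-diagonal covariance terms, d diagonal terms,
        # d mean terms and 1 mixture weight; e.g. d = 2 gives 1 + 2 + 2 + 1 = 6.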
        return nb_gmms * params_per_gmm - 1

    def update(self, task, reward):
        self.tasks.append(task)

        is_update_time = False

        # Compute corresponding ALP
        alp, lp = self.alp_computer.compute_alp(task, reward)
        self.alps.append(alp)

        # Concatenate task vector with ALP dimension
        self.tasks_alps.append(np.array(task.tolist() + [self.alps[-1]]))

        if len(self.tasks) >= self.nb_random:  # If initial bootstrapping is done
            if (len(self.tasks) % self.fit_rate) == 0:  # Time to fit
                is_update_time = True
                # 1 - Retrieve last <fit_rate> (task, reward) pairs
                cur_tasks_alps = np.array(self.tasks_alps[-self.fit_rate:])

                # 2 - Fit batch of GMMs with varying number of Gaussians
                self.potential_gmms = [
                    g.fit(cur_tasks_alps) for g in self.potential_gmms
                ]

                # 3 - Compute fitness and keep best GMM
                fitnesses = []
                if self.gmm_fitness_fun == 'bic':  # Bayesian Information Criterion
                    fitnesses = [
                        m.bic(cur_tasks_alps) for m in self.potential_gmms
                    ]
                elif self.gmm_fitness_fun == 'aic':  # Akaike Information Criterion
                    fitnesses = [
                        m.aic(cur_tasks_alps) for m in self.potential_gmms
                    ]
                elif self.gmm_fitness_fun == 'aicc':  # AICc (AIC with small-sample correction)
                    n = self.fit_rate
                    fitnesses = []
                    for m in self.potential_gmms:
                        k = self.get_nb_gmm_params(m)
                        penalty = (2 * k * (k + 1)) / (n - k - 1)
                        fitnesses.append(m.aic(cur_tasks_alps) + penalty)
                else:
                    raise NotImplementedError
                self.gmm = self.potential_gmms[np.argmin(fitnesses)]

                # book-keeping
                self.bk['weights'].append(self.gmm.weights_.copy())
                self.bk['covariances'].append(self.gmm.covariances_.copy())
                self.bk['means'].append(self.gmm.means_.copy())
                self.bk['tasks_alps'] = self.tasks_alps
                self.bk['tasks_lps'].append(lp)
                self.bk['episodes'].append(len(self.tasks))
        return is_update_time

    def sample_task(self):
        task_origin = None
        if (len(self.tasks) < self.nb_random) or (np.random.random() <
                                                  self.random_task_ratio):
            # Random task sampling
            new_task = self.random_task_generator.sample()
            task_origin = -1  # -1 = task originates from random sampling
        else:
            # ALP-based task sampling

            # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
            self.alp_means = [pos[-1] for pos in self.gmm.means_]

            # 2 - Sample Gaussian proportionally to its mean ALP
            idx = proportional_choice(self.alp_means, eps=0.0)
            task_origin = idx

            # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
            new_task = np.random.multivariate_normal(
                self.gmm.means_[idx], self.gmm.covariances_[idx])[:-1]
            new_task = np.clip(new_task, self.mins,
                               self.maxs).astype(np.float32)

        # boring book-keeping
        self.bk['tasks_origin'].append(task_origin)
        return new_task

    def dump(self, dump_dict):
        dump_dict.update(self.bk)
        return dump_dict
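
A minimal usage sketch of the ALP-GMM loop implemented above. The bounds, episode count and the evaluate helper are hypothetical; sampling is uniform during the bootstrapping phase and becomes ALP-guided once the GMM has been fit.

alp_gmm = ALPGMM(mins=[0.0, 0.0], maxs=[1.0, 1.0], seed=1,
                 params={"fit_rate": 100, "gmm_fitness_fun": "aic"})
for _ in range(1000):
    task = alp_gmm.sample_task()        # numpy array within [mins, maxs]
    reward = evaluate(task)             # hypothetical: episodic return on the sampled task
    alp_gmm.update(task, reward)        # refits the GMMs every fit_rate episodes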
Example #24
    def test_squashed_gaussian(self):
        """Tests the SquashedGaussian ActionDistribution for all frameworks."""
        input_space = Box(-2.0, 2.0, shape=(2000, 10))
        input_space.seed(42)

        low, high = -2.0, 1.0

        for fw, sess in framework_iterator(session=True):
            cls = SquashedGaussian if fw != "torch" else TorchSquashedGaussian

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(cls,
                                 input_space.shape,
                                 fw=fw,
                                 sess=sess,
                                 bounds=(low, high))

            # Batch of size=n and deterministic.
            inputs = input_space.sample()
            means, _ = np.split(inputs, 2, axis=-1)
            squashed_distribution = cls(inputs, {}, low=low, high=high)
            expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low
            # Sample n times, expect always mean value (deterministic draw).
            out = squashed_distribution.deterministic_sample()
            check(out, expected)

            # Batch of size=n and non-deterministic -> expect roughly the mean.
            inputs = input_space.sample()
            means, log_stds = np.split(inputs, 2, axis=-1)
            squashed_distribution = cls(inputs, {}, low=low, high=high)
            expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low
            values = squashed_distribution.sample()
            if sess:
                values = sess.run(values)
            else:
                values = values.numpy()
            self.assertTrue(np.max(values) <= high)
            self.assertTrue(np.min(values) >= low)

            check(np.mean(values), expected.mean(), decimals=1)

            # Test log-likelihood outputs.
            sampled_action_logp = squashed_distribution.logp(
                values if fw != "torch" else torch.Tensor(values))
            if sess:
                sampled_action_logp = sess.run(sampled_action_logp)
            else:
                sampled_action_logp = sampled_action_logp.numpy()
            # Convert to parameters for distr.
            stds = np.exp(
                np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT))
            # Unsquash values, then get log-llh from regular gaussian.
            # atanh_in = np.clip((values - low) / (high - low) * 2.0 - 1.0,
            #   -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER)
            normed_values = (values - low) / (high - low) * 2.0 - 1.0
            safe_normed_values = np.clip(normed_values, -1.0 + SMALL_NUMBER,
                                         1.0 - SMALL_NUMBER)
            unsquashed_values = np.arctanh(safe_normed_values)
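            # Change of variables for y = tanh(x): log p(y) = log p(x) - sum_i log(1 - tanh(x_i)^2),
            # which is exactly the correction applied below.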
            log_prob_unsquashed = np.sum(
                np.log(norm.pdf(unsquashed_values, means, stds)), -1)
            log_prob = log_prob_unsquashed - np.sum(
                np.log(1 - np.tanh(unsquashed_values)**2), axis=-1)
            check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05)

            # NN output.
            means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0],
                              [-0.1, -0.2, -0.3, -0.4, -1.0]])
            log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0],
                                 [0.7, -0.3, 0.4, -0.9, 2.0]])
            squashed_distribution = cls(
                inputs=np.concatenate([means, log_stds], axis=-1),
                model={},
                low=low,
                high=high,
            )
            # Convert to parameters for distr.
            stds = np.exp(log_stds)
            # Values to get log-likelihoods for.
            values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05],
                               [-0.9, -0.2, 0.4, -0.1, -1.05]])

            # Unsquash values, then get log-llh from regular gaussian.
            unsquashed_values = np.arctanh((values - low) /
                                           (high - low) * 2.0 - 1.0)
            log_prob_unsquashed = np.sum(
                np.log(norm.pdf(unsquashed_values, means, stds)), -1)
            log_prob = log_prob_unsquashed - np.sum(
                np.log(1 - np.tanh(unsquashed_values)**2), axis=-1)

            outs = squashed_distribution.logp(
                values if fw != "torch" else torch.Tensor(values))
            if sess:
                outs = sess.run(outs)
            check(outs, log_prob, decimals=4)
Example #25
    def test_diag_gaussian(self):
        """Tests the DiagGaussian ActionDistribution for all frameworks."""
        input_space = Box(-2.0, 1.0, shape=(2000, 10))
        input_space.seed(42)

        for fw, sess in framework_iterator(session=True):
            cls = DiagGaussian if fw != "torch" else TorchDiagGaussian

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(cls, input_space.shape, fw=fw, sess=sess)

            # Batch of size=n and deterministic.
            inputs = input_space.sample()
            means, _ = np.split(inputs, 2, axis=-1)
            diag_distribution = cls(inputs, {})
            expected = means
            # Sample n times, expect always mean value (deterministic draw).
            out = diag_distribution.deterministic_sample()
            check(out, expected)

            # Batch of size=n and non-deterministic -> expect roughly the mean.
            inputs = input_space.sample()
            means, log_stds = np.split(inputs, 2, axis=-1)
            diag_distribution = cls(inputs, {})
            expected = means
            values = diag_distribution.sample()
            if sess:
                values = sess.run(values)
            else:
                values = values.numpy()
            check(np.mean(values), expected.mean(), decimals=1)

            # Test log-likelihood outputs.
            sampled_action_logp = diag_distribution.logp(
                values if fw != "torch" else torch.Tensor(values))
            if sess:
                sampled_action_logp = sess.run(sampled_action_logp)
            else:
                sampled_action_logp = sampled_action_logp.numpy()

            # NN output.
            means = np.array(
                [[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]],
                dtype=np.float32,
            )
            log_stds = np.array(
                [[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]],
                dtype=np.float32,
            )

            diag_distribution = cls(inputs=np.concatenate([means, log_stds],
                                                          axis=-1),
                                    model={})
            # Convert to parameters for distr.
            stds = np.exp(log_stds)
            # Values to get log-likelihoods for.
            values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05],
                               [-0.9, -0.2, 0.4, -0.1, -1.05]])

            # get log-llh from regular gaussian.
            log_prob = np.sum(np.log(norm.pdf(values, means, stds)), -1)

            outs = diag_distribution.logp(
                values if fw != "torch" else torch.Tensor(values))
            if sess:
                outs = sess.run(outs)
            check(outs, log_prob, decimals=4)
Example #26
def test_forward_obs_only() -> None:
    """
    Test forward() when each task-specific output head multiplies the shared trunk
    output by some constant factor, and the task index is not included in the input.
    """

    # Set up case.
    dim = SETTINGS["obs_dim"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(SETTINGS["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim
    num_shared_layers = 1
    include_task_index = False

    # Construct network and set weights of each output head explicitly. We want to make
    # it so that each layer in the shared trunk computes an identity function (plus the
    # nonlinearity), f_i(x) = x * i + i (with broadcasted operations), where the i-th
    # output head is f_i.
    network = MultiTaskTrunkNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=SETTINGS["num_tasks"],
        num_shared_layers=num_shared_layers,
        num_task_layers=SETTINGS["num_task_layers"],
        hidden_size=hidden_size,
        downscale_last_layer=True,
        device=SETTINGS["device"],
    )

    # Set shared trunk weights.
    trunk_state_dict = network.trunk.state_dict()
    trunk_state_dict["0.0.weight"] = torch.Tensor(np.identity(hidden_size))
    trunk_state_dict["0.0.bias"] = torch.zeros(hidden_size)
    network.trunk.load_state_dict(trunk_state_dict)

    # Set task-specific weights.
    for i in range(SETTINGS["num_tasks"]):

        # Set weights.
        state_dict = network.output_heads[i].state_dict()
        state_dict["0.0.weight"] = torch.Tensor(i * np.identity(hidden_size))
        state_dict["0.0.bias"] = i * torch.ones(hidden_size)
        network.output_heads[i].load_state_dict(state_dict)

    # Construct batch of observations concatenated with one-hot task vectors.
    obs, task_indices = get_obs_batch(
        batch_size=SETTINGS["num_processes"],
        obs_space=observation_subspace,
        num_tasks=SETTINGS["num_tasks"],
    )
    obs_only = obs[:, :dim]

    # Get output of network.
    output = network(obs_only, task_indices)

    # Construct expected action distribution of network.
    expected_output_list = []
    for i, task_index in enumerate(task_indices):
        expected_output_list.append(
            torch.tanh(obs_only[i]) * task_index + task_index)
    expected_output = torch.stack(expected_output_list)

    # Test output of network.
    assert torch.allclose(output, expected_output)
Example #27
    def test_multi_action_distribution(self):
        """Tests the MultiActionDistribution (across all frameworks)."""
        batch_size = 1000
        input_space = Tuple([
            Box(-10.0, 10.0, shape=(batch_size, 4)),
            Box(
                -2.0,
                2.0,
                shape=(
                    batch_size,
                    6,
                ),
            ),
            Dict({"a": Box(-1.0, 1.0, shape=(batch_size, 4))}),
        ])
        input_space.seed(42)
        std_space = Box(
            -0.05,
            0.05,
            shape=(
                batch_size,
                3,
            ),
        )
        std_space.seed(42)

        low, high = -1.0, 1.0
        value_space = Tuple([
            Box(0, 3, shape=(batch_size, ), dtype=np.int32),
            Box(-2.0, 2.0, shape=(batch_size, 3), dtype=np.float32),
            Dict({"a": Box(0.0, 1.0, shape=(batch_size, 2),
                           dtype=np.float32)}),
        ])
        value_space.seed(42)

        for fw, sess in framework_iterator(session=True):
            if fw == "torch":
                cls = TorchMultiActionDistribution
                child_distr_cls = [
                    TorchCategorical,
                    TorchDiagGaussian,
                    partial(TorchBeta, low=low, high=high),
                ]
            else:
                cls = MultiActionDistribution
                child_distr_cls = [
                    Categorical,
                    DiagGaussian,
                    partial(Beta, low=low, high=high),
                ]

            inputs = list(input_space.sample())
            distr = cls(
                np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1),
                model={},
                action_space=value_space,
                child_distributions=child_distr_cls,
                input_lens=[4, 6, 4],
            )

            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            # Sample deterministically.
            expected_det = [
                np.argmax(inputs[0], axis=-1),
                inputs[1][:, :3],  # [:3]=Mean values.
                # Mean for a Beta distribution:
                # 1 / [1 + (beta/alpha)] * range + low
                (1.0 /
                 (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) *
                (high - low) + low,
            ]
            out = distr.deterministic_sample()
            if sess:
                out = sess.run(out)
            check(out[0], expected_det[0])
            check(out[1], expected_det[1])
            check(out[2]["a"], expected_det[2])

            # Stochastic sampling -> expect roughly the mean.
            inputs = list(input_space.sample())
            # Fix categorical inputs (not needed for distribution itself, but
            # for our expectation calculations).
            inputs[0] = softmax(inputs[0], -1)
            # Fix std inputs (shouldn't be too large for this test).
            inputs[1][:, 3:] = std_space.sample()
            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            distr = cls(
                np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1),
                model={},
                action_space=value_space,
                child_distributions=child_distr_cls,
                input_lens=[4, 6, 4],
            )
            expected_mean = [
                np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)),
                inputs[1][:, :3],  # [:3]=Mean values.
                # Mean for a Beta distribution:
                # 1 / [1 + (beta/alpha)] * range + low
                (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) *
                (high - low) + low,
            ]
            out = distr.sample()
            if sess:
                out = sess.run(out)
            out = list(out)
            if fw == "torch":
                out[0] = out[0].numpy()
                out[1] = out[1].numpy()
                out[2]["a"] = out[2]["a"].numpy()
            check(np.mean(out[0]), expected_mean[0], decimals=1)
            check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1)
            check(np.mean(out[2]["a"], 0),
                  np.mean(expected_mean[2], 0),
                  decimals=1)

            # Test log-likelihood outputs.
            # Make sure beta-values are within 0.0 and 1.0 for the numpy
            # calculation (which doesn't have scaling).
            inputs = list(input_space.sample())
            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            distr = cls(
                np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1),
                model={},
                action_space=value_space,
                child_distributions=child_distr_cls,
                input_lens=[4, 6, 4],
            )
            inputs[0] = softmax(inputs[0], -1)
            values = list(value_space.sample())
            log_prob_beta = np.log(
                beta.pdf(values[2]["a"], inputs[2]["a"][:, :2],
                         inputs[2]["a"][:, 2:]))
            # Now do the up-scaling for [2] (beta values) to be between
            # low/high.
            values[2]["a"] = values[2]["a"] * (high - low) + low
            inputs[1][:, 3:] = np.exp(inputs[1][:, 3:])
            expected_log_llh = np.sum(
                np.concatenate(
                    [
                        np.expand_dims(
                            np.log([
                                i[values[0][j]]
                                for j, i in enumerate(inputs[0])
                            ]),
                            -1,
                        ),
                        np.log(
                            norm.pdf(values[1], inputs[1][:, :3],
                                     inputs[1][:, 3:])),
                        log_prob_beta,
                    ],
                    -1,
                ),
                -1,
            )

            values[0] = np.expand_dims(values[0], -1)
            if fw == "torch":
                values = tree.map_structure(lambda s: torch.Tensor(s), values)
            # Test all flattened input.
            concat = np.concatenate(tree.flatten(values),
                                    -1).astype(np.float32)
            out = distr.logp(concat)
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
            # Test structured input.
            out = distr.logp(values)
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
            # Test flattened input.
            out = distr.logp(tree.flatten(values))
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
Example #28
class GymEnvWrapper(gym.Env):
    """Wraps an OpenAI Gym environment to be able to modify its dimensions corresponding to MDP Playground. The documentation for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py.

    Currently supported dimensions:
        transition noise (discrete)
        reward delay
        reward noise

    Also supports wrapping with AtariPreprocessing from OpenAI Gym or wrap_deepmind from Ray Rllib.

    """

    # Should not be a gym.Wrapper: gym.Wrapper exposes observation_space and action_space
    # member variables, but with irrelevant_features we would have multiple observation
    # spaces, which could conflict with code that assumes any gym.Wrapper subclass has
    # exactly those member variables. However, it _should_ be at least a gym.Env.
    # Does it need to be a subclass of base_class because some external code
    # may check if it's an AtariEnv, for instance, and do further stuff based
    # on that?

    def __init__(self, env, **config):
        self.config = copy.deepcopy(config)
        # self.env = config["env"]
        self.env = env

        seed_int = None
        if "seed" in config:
            seed_int = config["seed"]

        self.seed(seed_int)  # seed
        # IMP Move below code from here to seed()? Because if seed is called
        # during the run of an env, the expectation is that all obs., act. space,
        # etc. seeds are set? Only Atari in Gym seems to do something similar, the
        # others I saw there don't seem to set seed for obs., act. spaces.
        # IMP Apparently Atari also has a seed. Without seeding the wrapped env, for
        # beam_rider(?), about 1 in 5 runs gave a reward of 88.0 and the rest 44.0 with
        # the same action sequence; with this seed set, the reward was 44.0 across ~20
        # runs. ##TODO If this is really a wrapper, should it be modifying the seed of
        # the env?
        self.env.seed(seed_int)  # seed
        obs_space_seed = self.np_random.randint(sys.maxsize)  # random
        act_space_seed = self.np_random.randint(sys.maxsize)  # random
        self.env.observation_space.seed(obs_space_seed)  # seed
        self.env.action_space.seed(act_space_seed)  # seed

        # if "dummy_eval" in config: #hack
        #     del config["dummy_eval"]
        if "delay" in config:
            self.delay = config["delay"]
            assert config["delay"] >= 0
            self.reward_buffer = [0.0] * (self.delay)
        else:
            self.delay = 0

        if "transition_noise" in config:
            self.transition_noise = config["transition_noise"]
            if config["state_space_type"] == "continuous":
                assert callable(self.transition_noise), (
                    "transition_noise must be a function when env is continuous, it was of type:"
                    + str(type(self.transition_noise)))
            else:
                assert self.transition_noise <= 1.0 and self.transition_noise >= 0.0, (
                    "transition_noise must be a value in [0.0, 1.0] when env is discrete, it was:"
                    + str(self.transition_noise))
        else:
            if config["state_space_type"] == "discrete":
                self.transition_noise = 0.0
            else:
                self.transition_noise = lambda a: 0.0

        if "reward_noise" in config:
            if callable(config["reward_noise"]):
                self.reward_noise = config["reward_noise"]
            else:
                reward_noise_std = config["reward_noise"]
                self.reward_noise = lambda a: a.normal(0, reward_noise_std)
        else:
            self.reward_noise = None

        if ("wrap_deepmind_ray" in config
                and config["wrap_deepmind_ray"]):  # hack ##TODO remove?
            self.env = wrap_deepmind(self.env, dim=42, framestack=True)
        elif "atari_preprocessing" in config and config["atari_preprocessing"]:
            self.frame_skip = 4  # default for AtariPreprocessing
            if "frame_skip" in config:
                self.frame_skip = config["frame_skip"]
            self.grayscale_obs = False
            if "grayscale_obs" in config:
                self.grayscale_obs = config["grayscale_obs"]

            # Use AtariPreprocessing with frame_skip
            # noop_max set to 1 because we want to keep the vanilla env as
            # deterministic as possible and setting it to 0 was not allowed. ##TODO
            # noop_max=0 is possible in newer Gym versions, so update the Gym version.
            self.env = AtariPreprocessing(
                self.env,
                frame_skip=self.frame_skip,
                grayscale_obs=self.grayscale_obs,
                noop_max=1,
            )
            print("self.env.noop_max set to: ", self.env.noop_max)

        if "irrelevant_features" in config:
            # self.irrelevant_features =  config["irrelevant_features"]
            irr_toy_env_conf = config["irrelevant_features"]
            if "seed" not in irr_toy_env_conf:
                irr_toy_env_conf["seed"] = self.np_random.randint(
                    sys.maxsize)  # random

            self.irr_toy_env = RLToyEnv(**irr_toy_env_conf)

            if config["state_space_type"] == "discrete":
                self.action_space = Tuple(
                    (self.env.action_space, self.irr_toy_env.action_space))
                self.observation_space = Tuple(
                    (self.env.observation_space,
                     self.irr_toy_env.observation_space)
                )  # TODO for image observations, concatenate to 1 obs. space here and in step() and reset()?
            else:  # TODO Check the test case added for cont. irr features case and code for it in run_experiments.py.
                env_obs_low = self.env.observation_space.low
                env_obs_high = self.env.observation_space.high
                env_obs_dtype = env_obs_low.dtype
                env_obs_shape = env_obs_low.shape
                irr_env_obs_low = self.irr_toy_env.observation_space.low
                irr_env_obs_high = self.irr_toy_env.observation_space.high
                irr_env_obs_dtype = self.irr_toy_env.observation_space.low.dtype
                assert env_obs_dtype == irr_env_obs_dtype, (
                    "Datatypes of base env and irrelevant toy env should match. Were: "
                    + str(env_obs_dtype) + ", " + str(irr_env_obs_dtype))
                ext_low = np.concatenate((env_obs_low, irr_env_obs_low))
                ext_high = np.concatenate((env_obs_high, irr_env_obs_high))
                self.observation_space = Box(low=ext_low,
                                             high=ext_high,
                                             dtype=env_obs_dtype)

                env_act_low = self.env.action_space.low
                env_act_high = self.env.action_space.high
                env_act_dtype = env_act_low.dtype
                self.env_act_shape = env_act_low.shape
                assert (len(self.env_act_shape) == 1
                        ), "Length of shape of action space should be 1."
                irr_env_act_low = self.irr_toy_env.action_space.low
                irr_env_act_high = self.irr_toy_env.action_space.high
                irr_env_act_dtype = irr_env_act_low.dtype
                # assert env_obs_dtype == env_act_dtype, "Datatypes of obs. and act. of
                # base env should match. Were: " + str(env_obs_dtype) + ", " +
                # str(env_act_dtype) #TODO Apparently, observations are np.float64 and
                # actions np.float32 for Mujoco.
                ext_low = np.concatenate((env_act_low, irr_env_act_low))
                ext_high = np.concatenate((env_act_high, irr_env_act_high))
                self.action_space = Box(
                    low=ext_low, high=ext_high, dtype=env_act_dtype
                )  # TODO Use BoxExtended here and above?

            self.observation_space.seed(obs_space_seed)  # seed
            self.action_space.seed(act_space_seed)  # seed
        else:
            self.action_space = self.env.action_space
            self.observation_space = self.env.observation_space

        self.total_episodes = 0

        # if "action_loss_weight" in config: #hack
        #     del config["action_loss_weight"]
        # if "action_space_max" in config: #hack
        #     action_space_max = config["action_space_max"]
        #     del config["action_space_max"]
        # if "time_unit" in config: #hack
        #     time_unit = config["time_unit"]
        #     del config["time_unit"]
        # if "dummy_seed" in config: #hack
        #     del config["dummy_seed"]

        super(GymEnvWrapper, self).__init__()
        # if "action_space_max" in locals():
        #     print("Setting Mujoco self.action_space.low, self.action_space.high from:", self.action_space.low, self.action_space.high)
        #     self.action_space.low *= action_space_max
        #     self.action_space.high *= action_space_max
        #     print("to:", self.action_space.low, self.action_space.high)

        # if base_class == HalfCheetahEnv and action_space_max >= 4: #hack
        #     self.model.opt.timestep /= 2 # 0.005
        #     self.frame_skip *= 2
        #     print("Setting Mujoco timestep to", self.model.opt.timestep, "half of the usual to avoid instabilities. At the same time action repeat increased to twice its usual.")

        # if "time_unit" in locals(): #hack In HalfCheetah, this is needed because the reward function is dependent on the time_unit because it depends on velocity achieved which depends on amount of time torque was applied. In Pusher, Reacher, it is also needed because the reward is similar to the distance from current position to goal at _each_ step, which means if we calculate the reward multiple times in the same amount of "real" time, we'd need to average out the reward the more times we calculate the reward in the same amount of "real" time (i.e., when we have shorter acting timesteps). This is not the case with the toy enviroments because there the reward is amount of distance moved from current position to goal in the current timestep, so it's dependent on "real" time and not on acting timesteps.
        # self.frame_skip *= time_unit
        # self.frame_skip = int(self.frame_skip)
        # self._ctrl_cost_weight *= time_unit
        # self._forward_reward_weight *= time_unit
        # print("Setting Mujoco self.frame_skip, self._ctrl_cost_weight, self._forward_reward_weight to", self.frame_skip, self._ctrl_cost_weight, self._forward_reward_weight, "corresponding to time_unit in config.")

    def step(self, action):
        # next_state, reward, done, info = super(GymEnvWrapper, self).step(action)
        self.total_transitions_episode += 1

        if (self.config["state_space_type"] == "discrete"
                and self.transition_noise > 0.0):
            probs = (np.ones(shape=(self.env.action_space.n, )) *
                     self.transition_noise / (self.env.action_space.n - 1))
            probs[action] = 1 - self.transition_noise
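            # e.g. with 4 actions and transition_noise = 0.3, the chosen action keeps
            # probability 0.7 and each of the other 3 actions gets probability 0.1.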
            old_action = action
            action = int(
                self.np_random.choice(self.env.action_space.n, size=1,
                                      p=probs))  # random
            if old_action != action:
                # print("NOISE inserted", old_action, action)
                self.total_noisy_transitions_episode += 1
        else:  # cont. envs
            pass  # TODO
            # self.total_abs_noise_in_transition_episode += np.abs(noise_in_transition)

        if "irrelevant_features" in self.config:
            if self.config["state_space_type"] == "discrete":
                next_state, reward, done, info = self.env.step(action[0])
                next_state_irr, _, done_irr, _ = self.irr_toy_env.step(
                    action[1])
                next_state = tuple([next_state, next_state_irr])
            else:
                next_state, reward, done, info = self.env.step(
                    action[:self.env_act_shape[0]])
                next_state_irr, _, done_irr, _ = self.irr_toy_env.step(
                    action[self.env_act_shape[0]:])
                next_state = np.concatenate((next_state, next_state_irr))
        else:
            next_state, reward, done, info = self.env.step(action)

        if done:
            # if episode is finished return the rewards that were delayed and not
            # handed out before ##TODO add test case for this
            reward = np.sum(self.reward_buffer)
        else:
            self.reward_buffer.append(reward)
            old_reward = reward
            reward = self.reward_buffer[0]
            # print("rewards:", self.reward_buffer, old_reward, reward)
            del self.reward_buffer[0]
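        # e.g. with delay = 2, episode rewards [r0, r1, r2, ...] are returned as
        # [0.0, 0.0, r0, r1, ...]; on the terminal step the remaining buffered
        # rewards are summed and returned (see the `done` branch above).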

        # random ###TODO Would be better to parameterise this in terms of state,
        # action and time_step as well. Would need to change implementation to
        # have a queue for the rewards achieved and then pick the reward that was
        # generated delay timesteps ago.
        noise_in_reward = (self.reward_noise(self.np_random)
                           if self.reward_noise else 0)
        self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward)
        self.total_reward_episode += reward
        reward += noise_in_reward

        return next_state, reward, done, info

    def reset(self):
        # on episode "end" stuff (to not be invoked when reset() called when
        # self.total_episodes = 0; end is in quotes because it may not be a true
        # episode end reached by reaching a terminal state, but reset() may have
        # been called in the middle of an episode):
        if not self.total_episodes == 0:
            print(
                "Noise stats for previous episode num.: " +
                str(self.total_episodes) +
                " (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): "
                + str(self.total_abs_noise_in_reward_episode) + " " +
                str(self.total_abs_noise_in_transition_episode) + " " +
                str(self.total_reward_episode) + " " +
                str(self.total_noisy_transitions_episode) + " " +
                str(self.total_transitions_episode))

        # on episode start stuff:
        self.reward_buffer = [0.0] * (self.delay)

        self.total_episodes += 1

        self.total_abs_noise_in_reward_episode = 0
        self.total_abs_noise_in_transition_episode = 0  # only present in continuous spaces
        self.total_noisy_transitions_episode = 0  # only present in discrete spaces
        self.total_reward_episode = 0
        self.total_transitions_episode = 0

        if "irrelevant_features" in self.config:
            if self.config["state_space_type"] == "discrete":
                reset_state = self.env.reset()
                reset_state_irr = self.irr_toy_env.reset()
                reset_state = tuple([reset_state, reset_state_irr])
            else:
                reset_state = self.env.reset()
                reset_state_irr = self.irr_toy_env.reset()
                reset_state = np.concatenate((reset_state, reset_state_irr))
        else:
            reset_state = self.env.reset()
        return reset_state
        # return super(GymEnvWrapper, self).reset()

    def seed(self, seed=None):
        """Initialises the Numpy RNG for the environment by calling a utility for this in Gym.

        Parameters
        ----------
        seed : int
            seed to initialise the np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it.

        Returns
        -------
        int
            The seed returned by Gym
        """
        # If seed is None, you get a randomly generated seed from gym.utils...
        self.np_random, self.seed_ = gym.utils.seeding.np_random(
            seed)  # random
        print("Env SEED set to: " + str(seed) + ". Returned seed from Gym: " +
              str(self.seed_))

        return self.seed_
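
A minimal usage sketch for the wrapper above. The environment id and config values are illustrative; state_space_type, delay and reward_noise are interpreted exactly as in __init__ above, and a float reward_noise is turned into zero-mean Gaussian noise with that standard deviation.

base_env = gym.make("CartPole-v1")
env = GymEnvWrapper(base_env,
                    seed=0,
                    state_space_type="continuous",
                    delay=2,              # rewards are handed out 2 steps late
                    reward_noise=0.05)    # i.i.d. Gaussian noise (std 0.05) added to rewards
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())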
Example #29
class Imagination:
    def __init__(self, model, n_actors, horizon, measure):
        """
        Imaginary MDP

        Args:
            model: models.Model object
            n_actors: number of parallel episodes
            horizon: length of the episode
            measure: the reward function
        """

        self.model = model
        self.n_actors = n_actors
        self.horizon = horizon
        self.measure = measure
        self.ensemble_size = model.ensemble_size

        self.action_space = Box(low=-1.0, high=1.0, shape=(n_actors, self.model.d_action), dtype=np.float32)
        self.action_space.seed(np.random.randint(np.iinfo(np.uint32).max))

        self.init_state = None
        self.states = None
        self.steps = None

    def step(self, actions):
        n_act = self.n_actors
        es = self.ensemble_size

        actions = actions.to(self.model.device)

        # get next state distribution for all models
        with torch.no_grad():
            next_state_means, next_state_vars = self.model.forward_all(self.states, actions)    # shape: (n_actors, ensemble_size, d_state)

        i = torch.arange(n_act).to(self.model.device)
        j = torch.randint(es, size=(n_act,)).to(self.model.device)
        next_states = self.model.sample(next_state_means[i, j], next_state_vars[i, j])          # shape: (n_actors, d_state)
        #print (next_state_vars[i, j])

        if torch.any(torch.isnan(next_states)).item():
            warnings.warn("NaN in sampled next states!")

        if torch.any(torch.isinf(next_states)).item():
            warnings.warn("Inf in sampled next states!")

        # compute measure
        measures = self.measure(self.states,                                         # shape: (n_actors, d_state)
                                actions,                                             # shape: (n_actors, d_action)
                                next_states,                                         # shape: (n_actors, d_state)
                                next_state_means,                                    # shape: (n_actors, ensemble_size, d_state)
                                next_state_vars,                                     # shape: (n_actors, ensemble_size, d_state)
                                self.model)

        self.states = next_states
        self.steps += 1
        done = False
        if self.steps >= self.horizon:
            done = True

        return next_states, measures, done, {}

    def reset(self):
        states = torch.from_numpy(self.init_state).float()
        states = states.unsqueeze(0)
        states = states.repeat(self.n_actors, 1)
        states = states.to(self.model.device)
        self.steps = 0
        self.states = states                    # shape: (n_actors, d_state)
        return states

    def update_init_state(self, state):
        self.init_state = state
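
A minimal usage sketch of the imagined rollout loop. The model, measure and policy objects are hypothetical stand-ins for objects exposing the interfaces used above (forward_all, sample, ensemble_size, d_action and device for the model; a callable measure; a policy mapping state batches to action batches in [-1, 1]).

imagination = Imagination(model=model, n_actors=128, horizon=50, measure=measure)
imagination.update_init_state(initial_state)      # numpy array of shape (d_state,)
states = imagination.reset()                      # torch tensor of shape (n_actors, d_state)
done = False
while not done:
    actions = policy(states)                      # torch tensor of shape (n_actors, d_action)
    states, measures, done, _ = imagination.step(actions)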
Example #30
def test_forward_multiple() -> None:
    """
    Test forward() when none of the layers are fully shared. The function computed by
    the network should be:
    - f(x) = 3 * tanh(2 * tanh(x + 1) + 2) + 3 for task 0
    - f(x) = -3 * tanh(-2 * tanh(x + 1) - 2) - 3 for task 1
    - f(x) = -3 * tanh(1/2 * tanh(-x - 1) + 1/2) - 3 for task 2
    - f(x) = 3 * tanh(-2 * tanh(-x - 1) - 2) + 3 for task 3
    """

    # Set up case.
    dim = BASE_SETTINGS["obs_dim"] + BASE_SETTINGS["num_tasks"]
    observation_subspace = Box(low=-np.inf,
                               high=np.inf,
                               shape=(BASE_SETTINGS["obs_dim"], ))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = BaseMultiTaskSplittingNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=BASE_SETTINGS["num_tasks"],
        num_layers=BASE_SETTINGS["num_layers"],
        hidden_size=hidden_size,
        device=BASE_SETTINGS["device"],
    )

    # Split the network so that no layer is fully shared: at each split, the first listed
    # group of tasks stays on the given copy and the second group is assigned to a new copy.
    network.split(0, 0, [0, 1], [2, 3])
    network.split(1, 0, [0, 2], [1, 3])
    network.split(1, 0, [0], [2])
    network.split(2, 0, [0, 3], [1, 2])

    # Set network weights.
    state_dict = network.state_dict()
    for i in range(BASE_SETTINGS["num_layers"]):
        for j in range(3):
            weight_name = "regions.%d.%d.0.weight" % (i, j)
            bias_name = "regions.%d.%d.0.bias" % (i, j)
            if weight_name not in state_dict:
                continue

            if j == 0:
                state_dict[weight_name] = torch.Tensor(
                    (i + 1) * np.identity(dim))
                state_dict[bias_name] = torch.Tensor((i + 1) * np.ones(dim))
            elif j == 1:
                state_dict[weight_name] = torch.Tensor(-(i + 1) *
                                                       np.identity(dim))
                state_dict[bias_name] = torch.Tensor(-(i + 1) * np.ones(dim))
            elif j == 2:
                state_dict[weight_name] = torch.Tensor(1 / (i + 1) *
                                                       np.identity(dim))
                state_dict[bias_name] = torch.Tensor(1 / (i + 1) *
                                                     np.ones(dim))
            else:
                raise NotImplementedError

    network.load_state_dict(state_dict)

    # Construct batch of observations concatenated with one-hot task vectors.
    obs, task_indices = get_obs_batch(
        batch_size=BASE_SETTINGS["num_processes"],
        obs_space=observation_subspace,
        num_tasks=BASE_SETTINGS["num_tasks"],
    )

    # Get output of network.
    output = network(obs, task_indices)

    # Computed expected output of network.
    expected_output = torch.zeros(obs.shape)
    for i, (ob, task) in enumerate(zip(obs, task_indices)):
        if task == 0:
            expected_output[i] = 3 * torch.tanh(2 * torch.tanh(ob + 1) + 2) + 3
        elif task == 1:
            expected_output[i] = -3 * torch.tanh(-2 * torch.tanh(ob + 1) -
                                                 2) - 3
        elif task == 2:
            expected_output[i] = (
                -3 * torch.tanh(1 / 2 * torch.tanh(-ob - 1) + 1 / 2) - 3)
        elif task == 3:
            expected_output[i] = 3 * torch.tanh(-2 * torch.tanh(-ob - 1) -
                                                2) + 3
        else:
            raise NotImplementedError

    # Test output of network.
    assert torch.allclose(output, expected_output)