Example #1
def test_linear_approximator():
    np.random.seed(88)

    noise = 1e-3

    a = np.random.rand(1000, 3)

    k = np.random.rand(3, 2)
    b = a.dot(k) + np.random.randn(1000, 2) * noise

    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             output_shape=(2, ))

    approximator.fit(a, b)

    khat = approximator.get_weights()

    deltaK = (khat - k.T.flatten())

    assert np.linalg.norm(deltaK) < noise

    point = np.random.randn(3, )
    derivative = approximator.diff(point)

    lp = len(point)
    for i in range(derivative.shape[1]):
        assert (derivative[i * lp:(i + 1) * lp, i] == point).all()
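
The assertion above says the fitted weights recover k up to the noise level. As a cross-check, the same weights can be obtained in closed form with NumPy's least squares; the snippet below is an illustrative sketch that does not use mushroom at all.

import numpy as np


def least_squares_weights(a, b):
    # Solve min_W ||a.dot(W) - b||_F, which is what the linear fit should recover.
    w, *_ = np.linalg.lstsq(a, b, rcond=None)
    return w


np.random.seed(88)
noise = 1e-3
a = np.random.rand(1000, 3)
k = np.random.rand(3, 2)
b = a.dot(k) + np.random.randn(1000, 2) * noise

k_hat = least_squares_weights(a, b)
assert np.linalg.norm(k_hat - k) < noise
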
Example #2
def test_pytorch_approximator():
    np.random.seed(88)
    torch.manual_seed(88)

    noise = 1e-3**2

    a = np.random.rand(1000, 4)

    k = np.random.rand(4, 2)
    b = np.sin(a).dot(k) + np.random.randn(1000, 2) * noise

    approximator = Regressor(PyTorchApproximator,
                             input_shape=(4, ),
                             output_shape=(2, ),
                             network=ExampleNet,
                             optimizer={
                                 'class': optim.Adam,
                                 'params': {}
                             },
                             loss=F.mse_loss,
                             n_neurons=100,
                             n_hidden=1,
                             n_epochs=200,
                             batch_size=100,
                             quiet=True)

    approximator.fit(a, b)

    bhat = approximator.predict(a)
    error = np.linalg.norm(b - bhat, 'fro') / 1000
    error_inf = np.max(np.abs(b - bhat))

    print(b[:10])

    print(bhat[:10])

    print(error_inf)

    assert error < 2e-4

    gradient = approximator.diff(a[0])
    assert gradient.shape[1] == 2

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    bhat_random = approximator.predict(a)

    assert not np.array_equal(bhat, bhat_random)
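
The PyTorch tests above pass network=ExampleNet together with n_neurons and n_hidden, but the network class itself is not shown. The sketch below is one plausible definition, assuming the approximator instantiates it as network(input_shape, output_shape, **kwargs); it is an assumption for illustration, not the project's actual ExampleNet.

import torch.nn as nn
import torch.nn.functional as F


class ExampleNet(nn.Module):
    def __init__(self, input_shape, output_shape, n_neurons=100, n_hidden=1,
                 **kwargs):
        super().__init__()
        n_input = input_shape[-1]
        n_output = output_shape[0]
        # n_hidden fully connected layers of n_neurons units with ReLU activations.
        sizes = [n_input] + [n_neurons] * n_hidden
        self._hidden = nn.ModuleList(
            [nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)])
        self._out = nn.Linear(sizes[-1], n_output)

    def forward(self, x, **kwargs):
        x = x.float()
        for layer in self._hidden:
            x = F.relu(layer(x))
        return self._out(x)
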
Example #3
def test_pytorch_approximator():
    np.random.seed(1)
    torch.manual_seed(1)

    n_actions = 2
    s = np.random.rand(1000, 4)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(TorchApproximator, input_shape=(4,),
                             output_shape=(2,), n_actions=n_actions,
                             network=ExampleNet,
                             optimizer={'class': optim.Adam,
                                        'params': {}}, loss=F.mse_loss,
                             batch_size=100, quiet=True)

    approximator.fit(s, a, q, n_epochs=20)

    x_s = np.random.rand(2, 4)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a)
    y_test = np.array([0.37191153, 0.5920861])

    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.47908658, 0.37191153],
                       [0.5920861, 0.27575058]])

    assert np.allclose(y, y_test)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0., 0., 0., 0., 0.02627479, 0.76513696,
                              0.6672573, 0.35979462, 0., 1.])
    assert np.allclose(gradient, gradient_test)

    gradient = approximator.diff(x_s[0])
    gradient_test = np.array([[0.02627479, 0.], [0.76513696, 0.],
                              [0.6672573, 0.], [0.35979462, 0.],
                              [0., 0.02627479], [0., 0.76513696],
                              [0., 0.6672573], [0., 0.35979462], [1, 0.],
                              [0., 1.]])
    assert np.allclose(gradient, gradient_test)

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))
Example #4
def test_pytorch_approximator():
    np.random.seed(88)
    torch.manual_seed(88)

    noise = 1e-3**2

    a = np.random.rand(1000, 4)

    k = np.random.rand(4, 2)
    b = np.sin(a).dot(k) + np.random.randn(1000, 2) * noise

    approximator = Regressor(PyTorchApproximator,
                             input_shape=(4, ),
                             output_shape=(2, ),
                             network=ExampleNet,
                             optimizer={
                                 'class': optim.Adam,
                                 'params': {}
                             },
                             loss=F.mse_loss,
                             n_neurons=100,
                             n_hidden=1,
                             n_epochs=200,
                             batch_size=100,
                             quiet=True)

    approximator.fit(a, b)

    bhat = approximator.predict(a)
    error = np.linalg.norm(b - bhat, 'fro') / 1000
    error_inf = np.max(np.abs(b - bhat))

    print(b[:10])

    print(bhat[:10])

    print(error_inf)

    assert error < 2e-4
Example #5
File: dqn.py Project: ronald-xie/mushroom
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 target_update_frequency=2500, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, dtype=np.float32):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            approximator_params (dict, None): parameters of the approximator to
                build;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            history_length (int, 1): the number of samples composing a state;
            clip_reward (bool, True): whether to clip the reward or not;
            max_no_op_actions (int, 0): maximum number of no-op actions that
                can be sampled;
            no_op_action_value (int, 0): value of the no-op action;
            dtype (object, np.float32): dtype of the state array.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
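
The fit method above builds the standard DQN target: bootstrap from the target network and zero the bootstrap term for absorbing states. A minimal NumPy illustration of that computation, with made-up numbers:

import numpy as np

gamma = 0.99
reward = np.array([1.0, 0.0, -1.0])
absorbing = np.array([0, 1, 0])             # the second transition is terminal
q_target = np.array([[0.5, 0.2],            # Q_target(s', a) for each next state
                     [0.3, 0.9],
                     [0.1, 0.4]])

q_next = np.max(q_target, axis=1) * (1 - absorbing)   # what _next_q returns
td_target = reward + gamma * q_next                   # the q passed to approximator.fit
print(td_target)                                      # roughly [1.495, 0., -0.604]
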
Example #6
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 target_update_frequency=2500, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, dtype=np.float32):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            approximator_params (dict, None): parameters of the approximator to
                build;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            history_length (int, 1): the number of samples composing a state;
            clip_reward (bool, True): whether to clip the reward or not;
            max_no_op_actions (int, 0): maximum number of no-op actions that
                can be sampled;
            no_op_action_value (int, 0): value of the no-op action;
            dtype (object, np.float32): dtype of the state array.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
Example #7
File: dqn.py Project: svestark/mushroom
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.

    """
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 batch_size,
                 approximator_params,
                 target_update_frequency,
                 replay_memory=None,
                 initial_replay_size=500,
                 max_replay_size=5000,
                 fit_params=None,
                 n_approximators=1,
                 clip_reward=True):
        """
        Constructor.

        Args:
            approximator (object): the approximator to use to fit the
               Q-function;
            batch_size (int): the number of samples in a batch;
            approximator_params (dict): parameters of the approximator to
                build;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            replay_memory ([ReplayMemory, PrioritizedReplayMemory], None): the
                object of the replay memory to use; if None, a default replay
                memory is created;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            clip_reward (bool, True): whether to clip the reward or not.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        if replay_memory is not None:
            self._replay_memory = replay_memory
            if isinstance(replay_memory, PrioritizedReplayMemory):
                self._fit = self._fit_prioritized
            else:
                self._fit = self._fit_standard
        else:
            self._replay_memory = ReplayMemory(initial_replay_size,
                                               max_replay_size)
            self._fit = self._fit_standard

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._fit(dataset)

        self._n_updates += 1
        if self._n_updates % self._target_update_frequency == 0:
            self._update_target()

    def _fit_standard(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

    def _fit_prioritized(self, dataset):
        self._replay_memory.add(
            dataset,
            np.ones(len(dataset)) * self._replay_memory.max_priority)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, idxs, is_weight = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next
            td_error = q - self.approximator.predict(state, action)

            self._replay_memory.update(td_error, idxs)

            self.approximator.fit(state,
                                  action,
                                  q,
                                  weights=is_weight,
                                  **self._fit_params)

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        action = super(DQN, self).draw_action(np.array(state))

        return action
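
In _fit_prioritized above, new transitions enter the replay memory with the current maximum priority, sampled batches come back with importance-sampling weights, and priorities are refreshed from the TD error. The sketch below illustrates the proportional-priority bookkeeping following Schaul et al. (2016); the formulas are an assumption about what PrioritizedReplayMemory does internally, not mushroom's exact code.

import numpy as np

alpha, beta, eps = 0.6, 0.4, 1e-2           # hypothetical hyperparameters
td_error = np.array([0.5, -0.1, 2.0])

priorities = (np.abs(td_error) + eps) ** alpha       # priority update from TD error
probs = priorities / priorities.sum()                # sampling probabilities

n = len(td_error)
is_weight = (n * probs) ** (-beta)                   # importance-sampling correction
is_weight /= is_weight.max()                         # normalized, as is common
print(probs, is_weight)
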
Example #8
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.

    """
    def __init__(self, approximator, policy, mdp_info, params):
        alg_params = params['algorithm_params']
        self._batch_size = alg_params.get('batch_size')
        self._n_approximators = alg_params.get('n_approximators', 1)
        self._clip_reward = alg_params.get('clip_reward', True)
        self._train_frequency = alg_params.get('train_frequency')
        self._target_update_frequency = alg_params.get(
            'target_update_frequency')
        self._max_no_op_actions = alg_params.get('max_no_op_actions', 0)
        self._no_op_action_value = alg_params.get('no_op_action_value', 0)

        self._replay_memory = ReplayMemory(
            mdp_info, alg_params.get('initial_replay_size'),
            alg_params.get('max_replay_size'),
            alg_params.get('history_length', 1))
        self._buffer = Buffer(size=alg_params.get('history_length', 1))

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(params['approximator_params'])
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(params['approximator_params'])
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info, params)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q,
                                  **self.params['fit_params'])

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        self._no_op_actions = np.random.randint(self._buffer.size,
                                                self._max_no_op_actions + 1)
        self._episode_steps = 0
Example #9
class DQN(Agent):
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 batch_size,
                 target_update_frequency,
                 initial_replay_size,
                 train_frequency,
                 max_replay_size,
                 fit_params=None,
                 approximator_params=None,
                 n_approximators=1,
                 history_length=1,
                 clip_reward=True,
                 max_no_op_actions=0,
                 no_op_action_value=0,
                 p_mask=2 / 3.,
                 dtype=np.float32,
                 weighted_update=False):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency // train_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value
        self._p_mask = p_mask
        self.weighted_update = weighted_update
        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           n_approximators, dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):

        mask = np.random.binomial(1,
                                  self._p_mask,
                                  size=(len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask =\
                self._replay_memory.get(self._batch_size)

            q = np.array(self.approximator.predict(state))[0]
            q = q.reshape((self._n_approximators * self._batch_size, -1))
            q = q[np.arange(self._n_approximators * self._batch_size),
                  np.tile(action.ravel(), self._n_approximators)]
            q = q.reshape((self._n_approximators, self._batch_size)).T

            idxs = q.argsort()

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q_next_ordered = np.sort(q_next)
            #order target values to match the source values
            for i in range(idxs.shape[0]):
                q_next[i, idxs[i]] = q_next_ordered[i]

            q = reward.reshape(self._batch_size,
                               1) + self.mdp_info.gamma * q_next
            self.approximator.fit(state,
                                  action,
                                  q,
                                  mask=mask,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                `next_state`.

        Returns:
            Maximum action-value for each state in `next_state`.

        """
        q = np.array(self.target_approximator.predict(next_state))[0]
        for i in range(q.shape[1]):
            if absorbing[i]:
                q[:, i, :] *= 1. - absorbing[i]

        if not self.weighted_update:
            #find best actions
            best_actions = np.argmax(np.mean(q, axis=0), axis=1)
            max_q = np.zeros((q.shape[1], q.shape[0]))
            for i in range(q.shape[1]):
                max_q[i, :] = q[:, i, best_actions[i]]
            return max_q
        else:
            N = q.shape[0]
            num_actions = q.shape[2]
            batch_size = q.shape[1]
            probs = np.zeros((batch_size, num_actions))
            weights = 1 / N
            #calculate probability of being maximum
            for b in range(batch_size):
                for i in range(num_actions):
                    particles = q[:, b, i]
                    p = 0
                    for k in range(N):
                        p2 = 1
                        p_k = particles[k]
                        for j in range(num_actions):
                            if (j != i):
                                particles2 = q[:, b, j]
                                p3 = 0
                                for l in range(N):
                                    if particles2[l] <= p_k:
                                        p3 += weights
                                p2 *= p3
                        p += weights * p2
                    probs[b, i] = p
            max_q = np.zeros((batch_size, N))
            for i in range(batch_size):
                particles = np.zeros(N)
                for j in range(num_actions):
                    particles += q[:, i, j] * probs[i, j]
                max_q[i, :] = particles
            return max_q

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update_epsilon(state)
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1
        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
        self.policy.set_idx(np.random.randint(self._n_approximators))
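
The weighted update in _next_q above treats the n_approximators outputs as equally weighted particles and, with the nested loops, computes the probability that each action's value is the maximum under the empirical particle distributions. The standalone sketch below reproduces that computation and checks it against a Monte Carlo estimate; the names and numbers are illustrative only.

import numpy as np

rng = np.random.default_rng(0)
q = rng.normal(size=(10, 3))                 # N=10 particles for each of 3 actions


def prob_max_particles(q):
    # Same logic as the nested loops above, with uniform particle weights 1/N.
    n, n_actions = q.shape
    probs = np.zeros(n_actions)
    for i in range(n_actions):
        for k in range(n):
            # empirical P(every other action's value <= this particle)
            p = np.prod([(q[:, j] <= q[k, i]).mean()
                         for j in range(n_actions) if j != i])
            probs[i] += p / n
    return probs


def prob_max_sampling(q, n_samples=100_000):
    # Draw one particle per action uniformly at random and count which action wins.
    n, n_actions = q.shape
    draws = q[rng.integers(0, n, size=(n_samples, n_actions)), np.arange(n_actions)]
    return np.bincount(draws.argmax(axis=1), minlength=n_actions) / n_samples


print(prob_max_particles(q))
print(prob_max_sampling(q))                  # should roughly agree
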
Example #10
def test_linear_approximator():
    np.random.seed(1)

    # Generic regressor
    a = np.random.rand(1000, 3)

    k = np.random.rand(3, 2)
    b = a.dot(k) + np.random.randn(1000, 2)

    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             output_shape=(2, ))

    approximator.fit(a, b)

    x = np.random.rand(2, 3)
    y = approximator.predict(x)
    y_test = np.array([[0.57638247, 0.1573216], [0.11388247, 0.24123678]])

    assert np.allclose(y, y_test)

    point = np.random.randn(3, )
    derivative = approximator.diff(point)

    lp = len(point)
    for i in range(derivative.shape[1]):
        assert (derivative[i * lp:(i + 1) * lp, i] == point).all()

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    # Action regressor + Ensemble
    n_actions = 2
    s = np.random.rand(1000, 3)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             n_actions=n_actions,
                             n_models=5)

    approximator.fit(s, a, q)

    x_s = np.random.rand(2, 3)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a, prediction='mean')
    y_test = np.array([0.49225698, 0.69660881])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='sum')
    y_test = np.array([2.46128492, 3.48304404])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='min')
    y_test = np.array([[0.49225698, 0.69660881]])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.49225698, 0.44154141], [0.69660881, 0.69060195]])
    assert np.allclose(y, y_test)

    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             n_actions=n_actions)

    approximator.fit(s, a, q)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0.88471362, 0.11666548, 0.45466254, 0., 0., 0.])

    assert np.allclose(gradient, gradient_test)
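
The final gradient check reflects the action-regressor layout: with one linear model per discrete action, diff(x_s[0], x_a[0]) concatenates the per-action parameter gradients, and only the selected action's block is non-zero; for a linear model that block equals the state itself. A minimal reconstruction of gradient_test under that layout (the helper is hypothetical and assumes x_a[0] is action 0, consistent with the first block being the non-zero one):

import numpy as np

s = np.array([0.88471362, 0.11666548, 0.45466254])   # the non-zero block, i.e. x_s[0]


def action_regressor_diff(state, action, n_actions):
    # Concatenated gradient: the state fills the selected action's block, zeros elsewhere.
    grad = np.zeros(n_actions * len(state))
    grad[action * len(state):(action + 1) * len(state)] = state
    return grad


print(action_regressor_diff(s, action=0, n_actions=2))   # matches gradient_test above
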
Example #11
class GaussianDQN(Agent):
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 batch_size,
                 target_update_frequency,
                 initial_replay_size,
                 max_replay_size,
                 fit_params=None,
                 approximator_params=None,
                 clip_reward=True,
                 update_type='weighted',
                 delta=0.1,
                 store_prob=False,
                 q_max=100,
                 max_spread=None):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self.update_type = update_type
        self.delta = delta
        self.standard_bound = norm.ppf(1 - self.delta, loc=0, scale=1)
        self.store_prob = store_prob
        self.q_max = q_max
        self.max_spread = max_spread
        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0
        self._epsilon = 1e-7
        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(GaussianDQN, self).__init__(policy, mdp_info)

    @staticmethod
    def _compute_prob_max(mean_list, sigma_list):
        n_actions = len(mean_list)
        lower_limit = mean_list - 8 * sigma_list
        upper_limit = mean_list + 8 * sigma_list
        epsilon = 1e2
        n_trapz = 100
        x = np.zeros(shape=(n_trapz, n_actions))
        y = np.zeros(shape=(n_trapz, n_actions))
        integrals = np.zeros(n_actions)
        for j in range(n_actions):
            if sigma_list[j] < epsilon:
                p = 1
                for k in range(n_actions):
                    if k != j:
                        p *= norm.cdf(mean_list[j],
                                      loc=mean_list[k],
                                      scale=sigma_list[k])
                integrals[j] = p
            else:
                x[:, j] = np.linspace(lower_limit[j], upper_limit[j], n_trapz)
                y[:, j] = norm.pdf(x[:, j],
                                   loc=mean_list[j],
                                   scale=sigma_list[j])
                for k in range(n_actions):
                    if k != j:
                        y[:, j] *= norm.cdf(x[:, j],
                                            loc=mean_list[k],
                                            scale=sigma_list[k])
                integrals[j] = (upper_limit[j] - lower_limit[j]) / (
                    2 * (n_trapz - 1)) * (y[0, j] + y[-1, j] +
                                          2 * np.sum(y[1:-1, j]))

        # print(np.sum(integrals))
        # assert np.isclose(np.sum(integrals), 1)
        with np.errstate(divide='raise'):
            try:
                return integrals / np.sum(integrals)
            except FloatingPointError:
                print(integrals)
                print(mean_list)
                print(sigma_list)
                input()

    def fit(self, dataset):
        mask = np.ones((len(dataset), 2))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next, sigma_next, prob_explore = self._next_q(
                next_state, absorbing)

            q = reward + self.mdp_info.gamma * q_next
            sigma = self.mdp_info.gamma * sigma_next
            stacked = np.stack([q, sigma])

            self.approximator.fit(state,
                                  action,
                                  stacked,
                                  prob_exploration=prob_explore,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                `next_state`.

        Returns:
            Maximum action-value for each state in `next_state`.

        """
        q_and_sigma = self.target_approximator.predict(next_state).squeeze()

        q = q_and_sigma[0, :, :]
        sigma = q_and_sigma[1, :, :]
        for i in range(q.shape[0]):
            if absorbing[i]:
                q[i] *= 0
                sigma[i] *= self._epsilon
        max_q = np.zeros((q.shape[0]))
        max_sigma = np.zeros((q.shape[0]))
        probs = []
        prob_explore = np.zeros(q.shape[0])
        for i in range(q.shape[0]):  # for each batch
            means = q[i, :]
            sigmas = sigma[i, :]
            prob = GaussianDQN._compute_prob_max(means, sigmas)
            probs.append(prob)
            prob_explore[i] = 1. - np.max(prob)

        if self.update_type == 'mean':
            best_actions = np.argmax(q, axis=1)
            for i in range(q.shape[0]):
                max_q[i] = q[i, best_actions[i]]
                max_sigma[i] = sigma[i, best_actions[i]]
        elif self.update_type == 'weighted':
            for i in range(q.shape[0]):  # for each batch
                means = q[i, :]
                sigmas = sigma[i, :]
                prob = probs[i]
                max_q[i] = np.sum(means * prob)
                max_sigma[i] = np.sum(sigmas * prob)
        elif self.update_type == 'optimistic':
            for i in range(q.shape[0]):  # for each batch
                means = q[i, :]
                sigmas = sigma[i, :]
                bounds = sigmas * self.standard_bound + means
                bounds = np.clip(bounds, -self.q_max, self.q_max)
                next_index = np.random.choice(
                    np.argwhere(bounds == np.max(bounds)).ravel())
                max_q[i] = q[i, next_index]
                max_sigma[i] = sigma[i, next_index]
        else:
            raise ValueError("Update type not implemented")

        return max_q, max_sigma, np.mean(prob_explore)

    def draw_action(self, state):
        action = super(GaussianDQN, self).draw_action(np.array(state))

        return action

    def episode_start(self):
        return
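
The trapezoidal loop in _compute_prob_max above evaluates, for each action j, the probability that a Gaussian Q estimate with mean mu_j and std sigma_j is the largest, i.e. the integral of its density times the CDFs of the other actions. The standalone sketch below evaluates the same integral on a common grid and cross-checks it with sampling; the numbers are made up.

import numpy as np
from scipy.stats import norm

mu = np.array([1.0, 0.8, 0.0])
sigma = np.array([0.3, 0.5, 0.2])

# Trapezoidal evaluation of P(argmax = j), as in _compute_prob_max.
x = np.linspace(mu.min() - 8 * sigma.max(), mu.max() + 8 * sigma.max(), 2000)
probs = np.empty(3)
for j in range(3):
    integrand = norm.pdf(x, mu[j], sigma[j])
    for k in range(3):
        if k != j:
            integrand = integrand * norm.cdf(x, mu[k], sigma[k])
    probs[j] = np.sum((integrand[:-1] + integrand[1:]) * np.diff(x)) / 2

# Monte Carlo estimate of the same probabilities.
samples = np.random.default_rng(0).normal(mu, sigma, size=(200_000, 3))
mc_probs = np.bincount(samples.argmax(axis=1), minlength=3) / len(samples)

print(probs, mc_probs)                       # the two estimates should agree closely
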
Example #12
class Optimistic_AC(Agent):
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 target_update_frequency, initial_replay_size,
                 max_replay_size, fit_params=None, approximator_params=None,
                 n_approximators=1, clip_reward=True,
                 weighted_update=False, update_type='weighted', delta=0.1,
                 q_max=100, store_prob=False, max_spread=None):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self.weighted_update = weighted_update
        self.update_type = update_type
        self.q_max = q_max
        self.store_prob = store_prob
        self.max_spread = max_spread
        quantiles = [i * 1. / (n_approximators - 1) for i in range(n_approximators)]
        for p in range(n_approximators):
            if quantiles[p] >= 1 - delta:
                self.delta_index = p
                break

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(Optimistic_AC, self).__init__(policy, mdp_info)
    
    @staticmethod
    def _compute_prob_max(q_list):
        q_array = np.array(q_list).T
        score = (q_array[:, :, None, None] >= q_array).astype(int)
        prob = score.sum(axis=3).prod(axis=2).sum(axis=1)
        prob = prob.astype(np.float32)
        return prob / np.sum(prob)

    @staticmethod
    def scale(x, out_range=(-1, 1), axis=None):
        domain = np.min(x, axis), np.max(x, axis)
        y = (x - (domain[1] + domain[0]) / 2) / (domain[1] - domain[0])
        return y * (out_range[1] - out_range[0]) + (out_range[1] + out_range[0]) / 2

    def fit(self, dataset):
        mask = np.ones((len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask =\
                self._replay_memory.get(self._batch_size)
            self.policy.update(state, self.approximator)
            if self.update_type == 'ensemble_policy':
                self.ensemble_policy.update(state, self.approximator)
            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next, prob_explore = self._next_q(next_state, absorbing)

            if self.max_spread is not None:
                for i in range(q_next.shape[0]): #for each batch
                    min_range = np.min(q_next[i])
                    max_range = np.max(q_next[i])
                    if max_range - min_range > self.max_spread:
                        clip_range = (max_range - min_range) - self.max_spread
                        out_range = [min_range + clip_range / 2, max_range - clip_range / 2]
                        q_next[i] = Optimistic_AC.scale(q_next[i], out_range=out_range, axis=None)
            q = reward.reshape(self._batch_size,
                               1) + self.mdp_info.gamma * q_next

            margin = 0.05
            self.approximator.fit(state, action, q, mask=mask,
                                  prob_exploration=prob_explore,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                `next_state`.

        Returns:
            Maximum action-value for each state in `next_state`.

        """
        if self.update_type == 'sarsa':
            a = self.policy.predict(next_state)
            q = np.array(self.target_approximator.predict(next_state, a))[0]
            for i in range(q.shape[1]):
                if absorbing[i]:
                    q[:, i, :] *= 0
            return q, 0
        elif self.update_type == 'ensemble_policy':
            max_q = np.zeros((next_state.shape[0], self._n_approximators))
            for b in range(next_state.shape[0]):
                s = next_state[b]
                actions = self.ensemble_policy.predict(s) # num_policies x num_particles
                q = np.array(self.target_approximator.predict([s] * actions.shape[0], actions))[0]
                particles = q[:, :]
                particles = np.sort(particles, axis=0)
                means = np.mean(particles, axis=0)
                bounds = means + particles[self.delta_index, :]
                bounds = np.clip(bounds, -self.q_max, self.q_max)
                next_index = np.random.choice(np.argwhere(bounds == np.max(bounds)).ravel())
                max_q[b, :] = particles[:, next_index]
            return max_q, 0
        else:
            raise ValueError("Update type not supported")

    def draw_action(self, state):
        action = super(Optimistic_AC, self).draw_action(np.array(state))

        return action

    def episode_start(self):
        self.policy.set_idx(np.random.randint(self._n_approximators))
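
The quantile bookkeeping in the constructor above picks delta_index as the first of the n_approximators evenly spaced quantile levels at or above 1 - delta, i.e. the particle used for the optimistic bound. A tiny illustration with hypothetical values:

n_approximators, delta = 10, 0.1
quantiles = [i * 1. / (n_approximators - 1) for i in range(n_approximators)]
delta_index = next(p for p, q in enumerate(quantiles) if q >= 1 - delta)
print(delta_index)                           # 9 here, i.e. the top particle
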
Example #13
class BootstrappedDQN(Agent):
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 batch_size,
                 target_update_frequency,
                 initial_replay_size,
                 max_replay_size,
                 fit_params=None,
                 approximator_params=None,
                 n_approximators=1,
                 clip_reward=True,
                 p_mask=2 / 3.):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        self._p_mask = p_mask

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0
        self._episode_steps = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(BootstrappedDQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):
        mask = np.random.binomial(1,
                                  self._p_mask,
                                  size=(len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward.reshape(self._batch_size,
                               1) + self.mdp_info.gamma * q_next

            self.approximator.fit(state,
                                  action,
                                  q,
                                  mask=mask,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.
        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                `next_state`.
        Returns:
            Maximum action-value for each state in `next_state`.
        """
        q = np.array(self.target_approximator.predict(next_state))[0]
        for i in range(q.shape[1]):
            if absorbing[i]:
                q[:, i, :] *= 1. - absorbing[i]

        max_q = np.max(q, axis=2)

        return max_q.T

    def draw_action(self, state):
        action = super(BootstrappedDQN, self).draw_action(np.array(state))

        self._episode_steps += 1

        return action

    def episode_start(self):

        self._episode_steps = 0
        self.policy.set_idx(np.random.randint(self._n_approximators))
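
The mask drawn in fit above implements the bootstrap: every transition is assigned to each of the n_approximators heads independently with probability p_mask, and a head only trains on the transitions whose mask bit is 1. A minimal illustration with hypothetical shapes; the masked mean-squared TD error below only shows how the mask enters a per-head loss, it is not mushroom's actual loss code.

import numpy as np

rng = np.random.default_rng(0)
n_transitions, n_heads, p_mask = 5, 3, 2 / 3.

# One Bernoulli(p_mask) bit per (transition, head), as in fit() above.
mask = rng.binomial(1, p_mask, size=(n_transitions, n_heads))

td_error = rng.normal(size=(n_transitions, n_heads))
per_head_loss = (mask * td_error ** 2).sum(axis=0) / np.maximum(mask.sum(axis=0), 1)
print(mask)
print(per_head_loss)                         # one masked mean-squared TD error per head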