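Imports the class below relies on. The stdlib/numpy/torch imports are standard; the project-specific dependencies (Agent, QActor, ParamActor, Memory, OrnsteinUhlenbeckActionNoise and the target-network helpers) are left as commented placeholders because their module paths depend on the surrounding repository and are not given in this listing.

import random
from collections import Counter
from copy import deepcopy

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Project-specific dependencies -- the import paths below are placeholders
# (assumed, not taken from the original source); point them at your own modules:
# from agents.agent import Agent
# from agents.actors import QActor, ParamActor
# from agents.memory import Memory
# from agents.noise import OrnsteinUhlenbeckActionNoise
# from agents.utils import hard_update_target_network, soft_update_target_network
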
class PDQNAgent(Agent):
    """
    DDPG actor-critic agent for parameterised action spaces
    [Hausknecht and Stone 2016]
    """

    NAME = "P-DQN Agent"

    def __init__(
            self,
            observation_space,
            action_space,
            actor_class=QActor,
            actor_kwargs={},
            actor_param_class=ParamActor,
            actor_param_kwargs={},
            epsilon_initial=1.0,
            epsilon_final=0.05,
            epsilon_steps=10000,
            batch_size=64,
            gamma=0.99,
            tau_actor=0.01,  # Polyak averaging factor for copying target weights
            tau_actor_param=0.001,
            replay_memory_size=1000000,
            learning_rate_actor=0.0001,
            learning_rate_actor_param=0.00001,
            initial_memory_threshold=0,
            use_ornstein_noise=False,  # if false, uses epsilon-greedy with uniform-random action-parameter exploration
            loss_func=F.mse_loss,  # F.mse_loss
            clip_grad=10,
            inverting_gradients=False,
            zero_index_gradients=False,
            indexed=False,
            weighted=False,
            average=False,
            random_weighted=False,
            device="cuda" if torch.cuda.is_available() else "cpu",
            seed=None):
        super(PDQNAgent, self).__init__(observation_space, action_space)
        self.device = torch.device(device)
        self.num_actions = self.action_space.spaces[0].n
        self.action_parameter_sizes = np.array([
            self.action_space.spaces[i].shape[0]
            for i in range(1, self.num_actions + 1)
        ])
        self.action_parameter_size = int(self.action_parameter_sizes.sum())
        self.action_max = torch.from_numpy(np.ones(
            (self.num_actions, ))).float().to(device)
        self.action_min = -self.action_max.detach()
        self.action_range = (self.action_max - self.action_min).detach()
        print([
            self.action_space.spaces[i].high
            for i in range(1, self.num_actions + 1)
        ])
        self.action_parameter_max_numpy = np.concatenate([
            self.action_space.spaces[i].high
            for i in range(1, self.num_actions + 1)
        ]).ravel()
        self.action_parameter_min_numpy = np.concatenate([
            self.action_space.spaces[i].low
            for i in range(1, self.num_actions + 1)
        ]).ravel()
        self.action_parameter_range_numpy = (self.action_parameter_max_numpy -
                                             self.action_parameter_min_numpy)
        self.action_parameter_max = torch.from_numpy(
            self.action_parameter_max_numpy).float().to(device)
        self.action_parameter_min = torch.from_numpy(
            self.action_parameter_min_numpy).float().to(device)
        self.action_parameter_range = torch.from_numpy(
            self.action_parameter_range_numpy).float().to(device)
        self.epsilon = epsilon_initial
        self.epsilon_initial = epsilon_initial
        self.epsilon_final = epsilon_final
        self.epsilon_steps = epsilon_steps
        self.indexed = indexed
        self.weighted = weighted
        self.average = average
        self.random_weighted = random_weighted
        # At most one of the weighted / average / random_weighted Q-loss variants may be enabled.
        assert sum([weighted, average, random_weighted]) <= 1

        self.action_parameter_offsets = self.action_parameter_sizes.cumsum()
        self.action_parameter_offsets = np.insert(
            self.action_parameter_offsets, 0, 0)

        self.batch_size = batch_size
        self.gamma = gamma
        self.replay_memory_size = replay_memory_size
        self.initial_memory_threshold = initial_memory_threshold
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_actor_param = learning_rate_actor_param
        self.inverting_gradients = inverting_gradients
        self.tau_actor = tau_actor
        self.tau_actor_param = tau_actor_param
        self._step = 0
        self._episode = 0
        self.updates = 0
        self.clip_grad = clip_grad
        self.zero_index_gradients = zero_index_gradients

        self.np_random = None
        self.seed = seed
        self._seed(seed)

        self.use_ornstein_noise = use_ornstein_noise
        self.noise = OrnsteinUhlenbeckActionNoise(
            self.action_parameter_size,
            random_machine=self.np_random,
            mu=0.,
            theta=0.15,
            sigma=0.0001)  #, theta=0.01, sigma=0.01)

        print(self.num_actions + self.action_parameter_size)
        self.replay_memory = Memory(replay_memory_size,
                                    observation_space.shape,
                                    (1 + self.action_parameter_size, ),
                                    next_actions=False)
        self.actor = actor_class(self.observation_space.shape[0],
                                 self.num_actions, self.action_parameter_size,
                                 **actor_kwargs).to(device)
        self.actor_target = actor_class(self.observation_space.shape[0],
                                        self.num_actions,
                                        self.action_parameter_size,
                                        **actor_kwargs).to(device)
        hard_update_target_network(self.actor, self.actor_target)
        self.actor_target.eval()

        self.actor_param = actor_param_class(self.observation_space.shape[0],
                                             self.num_actions,
                                             self.action_parameter_size,
                                             **actor_param_kwargs).to(device)
        self.actor_param_target = actor_param_class(
            self.observation_space.shape[0], self.num_actions,
            self.action_parameter_size, **actor_param_kwargs).to(device)
        hard_update_target_network(self.actor_param, self.actor_param_target)
        self.actor_param_target.eval()

        self.loss_func = loss_func  # F.smooth_l1_loss performs better but the original paper used MSE

        # Original DDPG paper [Lillicrap et al. 2016] used a weight decay of 0.01 for Q (critic)
        # but setting weight_decay=0.01 on the critic_optimiser seems to perform worse...
        # using AMSgrad ("fixed" version of Adam, amsgrad=True) doesn't seem to help either...
        self.actor_optimiser = optim.Adam(
            self.actor.parameters(),
            lr=self.learning_rate_actor)  #, betas=(0.95, 0.999))
        self.actor_param_optimiser = optim.Adam(
            self.actor_param.parameters(), lr=self.learning_rate_actor_param
        )  #, betas=(0.95, 0.999)) #, weight_decay=critic_l2_reg)

    def __str__(self):
        desc = super().__str__() + "\n"
        desc += "Actor Network {}\n".format(self.actor) + \
                "Param Network {}\n".format(self.actor_param) + \
                "Actor Alpha: {}\n".format(self.learning_rate_actor) + \
                "Actor Param Alpha: {}\n".format(self.learning_rate_actor_param) + \
                "Gamma: {}\n".format(self.gamma) + \
                "Tau (actor): {}\n".format(self.tau_actor) + \
                "Tau (actor-params): {}\n".format(self.tau_actor_param) + \
                "Inverting Gradients: {}\n".format(self.inverting_gradients) + \
                "Replay Memory: {}\n".format(self.replay_memory_size) + \
                "Batch Size: {}\n".format(self.batch_size) + \
                "Initial memory: {}\n".format(self.initial_memory_threshold) + \
                "epsilon_initial: {}\n".format(self.epsilon_initial) + \
                "epsilon_final: {}\n".format(self.epsilon_final) + \
                "epsilon_steps: {}\n".format(self.epsilon_steps) + \
                "Clip Grad: {}\n".format(self.clip_grad) + \
                "Ornstein Noise?: {}\n".format(self.use_ornstein_noise) + \
                "Zero Index Grads?: {}\n".format(self.zero_index_gradients) + \
                "Seed: {}\n".format(self.seed)
        return desc

    def set_action_parameter_passthrough_weights(self,
                                                 initial_weights,
                                                 initial_bias=None):
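        # Copy the given weights (and optional bias) into the ParamActor's
        # passthrough layer and freeze them so this initial action-parameter
        # mapping is not changed by training; the target network is synced too.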
        passthrough_layer = self.actor_param.action_parameters_passthrough_layer
        print(initial_weights.shape)
        print(passthrough_layer.weight.data.size())
        assert initial_weights.shape == passthrough_layer.weight.data.size()
        passthrough_layer.weight.data = torch.Tensor(
            initial_weights).float().to(self.device)
        if initial_bias is not None:
            print(initial_bias.shape)
            print(passthrough_layer.bias.data.size())
            assert initial_bias.shape == passthrough_layer.bias.data.size()
            passthrough_layer.bias.data = torch.Tensor(
                initial_bias).float().to(self.device)
        passthrough_layer.requires_grad = False
        passthrough_layer.weight.requires_grad = False
        passthrough_layer.bias.requires_grad = False
        hard_update_target_network(self.actor_param, self.actor_param_target)

    def _seed(self, seed=None):
        """
        NOTE: this will not reset the randomly initialised weights; use the seed parameter in the constructor instead.

        :param seed:
        :return:
        """
        self.seed = seed
        random.seed(seed)
        np.random.seed(seed)
        self.np_random = np.random.RandomState(seed=seed)
        if seed is not None:
            torch.manual_seed(seed)
            if self.device == torch.device("cuda"):
                torch.cuda.manual_seed(seed)

    def _ornstein_uhlenbeck_noise(self, all_action_parameters):
        """ Continuous action exploration using an Ornstein–Uhlenbeck process. """
        return all_action_parameters.data.cpu().numpy() + (
            self.noise.sample() * self.action_parameter_range_numpy)

    def start_episode(self):
        pass

    def end_episode(self):
        self._episode += 1

        ep = self._episode
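        # Linearly anneal epsilon from epsilon_initial to epsilon_final over epsilon_steps episodes.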
        if ep < self.epsilon_steps:
            self.epsilon = self.epsilon_initial - (self.epsilon_initial -
                                                   self.epsilon_final) * (
                                                       ep / self.epsilon_steps)
        else:
            self.epsilon = self.epsilon_final

    def act(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).to(self.device)
            all_action_parameters = self.actor_param.forward(state)

            # Hausknecht and Stone [2016] use epsilon greedy actions with uniform random action-parameter exploration
            rnd = self.np_random.uniform()
            if rnd < self.epsilon:
                action = self.np_random.choice(self.num_actions)
                if not self.use_ornstein_noise:
                    # draw from the seeded RNG so exploration is reproducible
                    all_action_parameters = torch.from_numpy(
                        self.np_random.uniform(self.action_parameter_min_numpy,
                                               self.action_parameter_max_numpy))
            else:
                # select maximum action
                Q_a = self.actor.forward(state.unsqueeze(0),
                                         all_action_parameters.unsqueeze(0))
                Q_a = Q_a.detach().cpu().data.numpy()
                action = np.argmax(Q_a)

            # add noise only to parameters of chosen action
            all_action_parameters = all_action_parameters.cpu().data.numpy()
            offset = np.array(
                [self.action_parameter_sizes[i] for i in range(action)],
                dtype=int).sum()
            if self.use_ornstein_noise and self.noise is not None:
                all_action_parameters[
                    offset:offset +
                    self.action_parameter_sizes[action]] += self.noise.sample(
                    )[offset:offset + self.action_parameter_sizes[action]]
            action_parameters = all_action_parameters[
                offset:offset + self.action_parameter_sizes[action]]

        return action, action_parameters, all_action_parameters

    def _zero_index_gradients(self, grad, batch_action_indices, inplace=True):
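        # Zero the gradient entries of action-parameters that do not belong to the
        # discrete action taken in each batch sample.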
        assert grad.shape[0] == batch_action_indices.shape[0]
        grad = grad.cpu()

        if not inplace:
            grad = grad.clone()
        with torch.no_grad():
            ind = torch.zeros(self.action_parameter_size, dtype=torch.long)
            for a in range(self.num_actions):
                ind[self.action_parameter_offsets[a]:self.
                    action_parameter_offsets[a + 1]] = a
            # ind_tile = np.tile(ind, (self.batch_size, 1))
            ind_tile = ind.repeat(self.batch_size, 1)
            # compare on the CPU so the mask matches `grad`, which was moved to the CPU above
            actual_index = ind_tile != batch_action_indices.cpu()[:, np.newaxis]
            grad[actual_index] = 0.
        return grad

    def _invert_gradients(self, grad, vals, grad_type, inplace=True):
        # 5x faster on CPU (for Soccer, slightly slower for Goal, Platform?)
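        # Inverting-gradients rule [Hausknecht and Stone 2016]: gradients pushing a
        # value towards its upper bound are scaled by (max - val) / range, and those
        # pushing towards the lower bound by (val - min) / range, so parameters slow
        # down near their bounds instead of being hard-clipped.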
        if grad_type == "actions":
            max_p = self.action_max
            min_p = self.action_min
            rnge = self.action_range
        elif grad_type == "action_parameters":
            max_p = self.action_parameter_max
            min_p = self.action_parameter_min
            rnge = self.action_parameter_range
        else:
            raise ValueError("Unhandled grad_type: '" + str(grad_type) + "'")

        max_p = max_p.cpu()
        min_p = min_p.cpu()
        rnge = rnge.cpu()
        grad = grad.cpu()
        vals = vals.cpu()

        assert grad.shape == vals.shape

        if not inplace:
            grad = grad.clone()
        with torch.no_grad():
            # index = grad < 0  # actually > but Adam minimises, so reversed (could also double negate the grad)
            index = grad > 0
            grad[index] *= (index.float() * (max_p - vals) / rnge)[index]
            grad[~index] *= ((~index).float() * (vals - min_p) / rnge)[~index]

        return grad

    def step(self,
             state,
             action,
             reward,
             next_state,
             next_action,
             terminal,
             time_steps=1):
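        # `action` and `next_action` are (discrete_action, all_action_parameters) pairs.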
        act, all_action_parameters = action
        self._step += 1

        # self._add_sample(state, np.concatenate((all_actions.data, all_action_parameters.data)).ravel(), reward, next_state, terminal)
        self._add_sample(state,
                         np.concatenate(
                             ([act], all_action_parameters)).ravel(),
                         reward,
                         next_state,
                         np.concatenate(
                             ([next_action[0]], next_action[1])).ravel(),
                         terminal=terminal)
        if self._step >= self.batch_size and self._step >= self.initial_memory_threshold:
            self._optimize_td_loss()
            self.updates += 1

    def _add_sample(self, state, action, reward, next_state, next_action,
                    terminal):
        assert len(action) == 1 + self.action_parameter_size
        self.replay_memory.append(state,
                                  action,
                                  reward,
                                  next_state,
                                  terminal=terminal)

    def _optimize_td_loss(self):
        if self._step < self.batch_size or self._step < self.initial_memory_threshold:
            return
        # Sample a batch from replay memory
        states, actions, rewards, next_states, terminals = self.replay_memory.sample(
            self.batch_size, random_machine=self.np_random)

        states = torch.from_numpy(states).to(self.device)
        actions_combined = torch.from_numpy(actions).to(
            self.device)  # make sure to separate actions and parameters
        actions = actions_combined[:, 0].long()
        action_parameters = actions_combined[:, 1:]
        rewards = torch.from_numpy(rewards).to(self.device).squeeze()
        next_states = torch.from_numpy(next_states).to(self.device)
        terminals = torch.from_numpy(terminals).to(self.device).squeeze()

        # ---------------------- optimize Q-network ----------------------
        with torch.no_grad():
            pred_next_action_parameters = self.actor_param_target.forward(
                next_states)
            pred_Q_a = self.actor_target(next_states,
                                         pred_next_action_parameters)
            Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()

            # one-step TD target: y = r + gamma * (1 - terminal) * max_a Q_target(s', a, x_target(s'))
            target = rewards + (1 - terminals) * self.gamma * Qprime

        # Compute current Q-values using policy network
        q_values = self.actor(states, action_parameters)
        y_predicted = q_values.gather(1, actions.view(-1, 1)).squeeze()
        y_expected = target
        loss_Q = self.loss_func(y_predicted, y_expected)

        self.actor_optimiser.zero_grad()
        loss_Q.backward()
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.clip_grad)
        self.actor_optimiser.step()

        # ---------------------- optimize actor ----------------------
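        # The parameter-actor update uses a two-step surrogate:
        #   1) run the Q-network on the current parameter-actor output and take
        #      dQ/d(action_params), optionally weighting, inverting or zeroing it;
        #   2) re-run actor_param and backpropagate -delta_a * action_params so the
        #      (possibly inverted) gradients flow into the parameter network's weights.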
        with torch.no_grad():
            action_params = self.actor_param(states)
        action_params.requires_grad = True
        assert sum([self.weighted, self.average, self.random_weighted]) <= 1
        Q = self.actor(states, action_params)
        Q_val = Q
        if self.weighted:
            # approximate categorical probability density (i.e. counting)
            counts = Counter(actions.cpu().numpy())
            weights = torch.from_numpy(
                np.array([
                    counts[a] / actions.shape[0]
                    for a in range(self.num_actions)
                ])).float().to(self.device)
            Q_val = weights * Q
        elif self.average:
            Q_val = Q / self.num_actions
        elif self.random_weighted:
            weights = self.np_random.uniform(0, 1., self.num_actions)
            weights /= np.linalg.norm(weights)
            weights = torch.from_numpy(weights).float().to(self.device)
            Q_val = weights * Q
        if self.indexed:
            Q_indexed = Q_val.gather(1, actions.unsqueeze(1))
            Q_loss = torch.mean(Q_indexed)
        else:
            Q_loss = torch.mean(torch.sum(Q_val, 1))
        self.actor.zero_grad()
        Q_loss.backward()
        delta_a = deepcopy(action_params.grad.data)
        # step 2
        action_params = self.actor_param(Variable(states))
        delta_a[:] = self._invert_gradients(delta_a,
                                            action_params,
                                            grad_type="action_parameters",
                                            inplace=True)
        if self.zero_index_gradients:
            delta_a[:] = self._zero_index_gradients(
                delta_a, batch_action_indices=actions, inplace=True)

        out = -torch.mul(delta_a, action_params)
        self.actor_param.zero_grad()
        out.backward(torch.ones(out.shape).to(self.device))
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor_param.parameters(),
                                           self.clip_grad)

        self.actor_param_optimiser.step()

        soft_update_target_network(self.actor, self.actor_target,
                                   self.tau_actor)
        soft_update_target_network(self.actor_param, self.actor_param_target,
                                   self.tau_actor_param)

    def save_models(self, prefix):
        """
        Saves the actor (Q-network) and actor-param network weights.
        :param prefix: path prefix for the saved model files (e.g. including the episode count)
        :return:
        """
        torch.save(self.actor.state_dict(), prefix + '_actor.pt')
        torch.save(self.actor_param.state_dict(), prefix + '_actor_param.pt')
        print('Models saved successfully')

    def load_models(self, prefix):
        """
        Loads the actor (Q-network) and actor-param network weights saved by save_models.
        :param prefix: path prefix used when the models were saved
        :return:
        """
        # weights are loaded via map_location='cpu', so this also works without a GPU
        self.actor.load_state_dict(
            torch.load(prefix + '_actor.pt', map_location='cpu'))
        self.actor_param.load_state_dict(
            torch.load(prefix + '_actor_param.pt', map_location='cpu'))
        print('Models loaded successfully')
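
A minimal, hypothetical usage sketch (not part of the original source). It assumes gym-style spaces in which the action space is a Tuple whose first entry is the Discrete action selector and whose remaining entries are one Box of continuous parameters per discrete action, which is the layout the constructor above indexes into; the sizes below are arbitrary.

# Hypothetical usage sketch -- spaces and sizes are illustrative only.
import numpy as np
from gym import spaces

observation_space = spaces.Box(low=-1.0, high=1.0, shape=(10,), dtype=np.float32)
action_space = spaces.Tuple((
    spaces.Discrete(3),                                            # which discrete action to take
    spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32),  # parameters of action 0
    spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32),  # parameters of action 1
    spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32),  # parameters of action 2
))

agent = PDQNAgent(observation_space, action_space, batch_size=32, seed=0)

state = observation_space.sample()
act, act_params, all_params = agent.act(state)

next_state = observation_space.sample()
next_act, _, next_all_params = agent.act(next_state)

# step() stores the transition and, once enough samples have accumulated, runs a learning update.
agent.step(state, (act, all_params), reward=0.0, next_state=next_state,
           next_action=(next_act, next_all_params), terminal=False)
agent.end_episode()

With the defaults, exploration is epsilon-greedy with uniform random action-parameters, and the TD/actor updates only begin once the replay memory holds at least batch_size (and initial_memory_threshold) transitions.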