Example #1
    def act(self,
            observation,
            stochastic,
            update_eps,
            mask,
            training_attacker,
            select_actions: bool = True):
        observation = ptu.send_to_gpu(observation, self.gpu)
        if training_attacker:
            mask = ptu.send_to_gpu(mask, self.gpu)

        batch_dim = observation.shape[0]
        device = observation.device

        q_values = self.q(self.s_encoder(observation))

        # If we're training the attacker, apply a mask to the actions.
        if training_attacker:
            q_values += mask

        # Return the raw Q values if we shouldn't select an action.
        if not select_actions:
            return q_values.detach().cpu().numpy()

        deterministic_actions = torch.argmax(q_values, dim=1)

        # Select actions from the Q values.
        if stochastic:
            random_actions = Uniform(0, 1)
            random_actions = random_actions.sample(
                [batch_dim, self.action_dim])
            random_actions = random_actions.to(device)
            if training_attacker:
                random_actions += mask
            random_actions = torch.argmax(random_actions, dim=1)

            # Epsilon greedy action selection.
            choose_random = Uniform(0, 1)
            choose_random = choose_random.sample([batch_dim]) < self.eps
            choose_random = choose_random.to(device)
            actions = torch.where(choose_random, random_actions,
                                  deterministic_actions)
        else:
            actions = deterministic_actions

        # Maybe update epsilon.
        if update_eps >= 0:
            self.eps = update_eps

        return actions.cpu().numpy()
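
The additive `mask` above is how unavailable actions are excluded: assuming it holds 0 for available actions and a large negative value (or -inf) for unavailable ones, adding it to the Q values (or to the uniform samples) guarantees `argmax` never selects a masked action. A minimal, self-contained illustration with made-up numbers:

import torch

q_values = torch.tensor([[1.0, 3.0, 2.0]])
mask = torch.tensor([[0.0, float("-inf"), 0.0]])  # action 1 is unavailable
masked_q = q_values + mask
print(torch.argmax(masked_q, dim=1))  # tensor([2]): the masked action is never chosen
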
Example #2
    def __post_init__(self):
        nn.Module.__init__(self)

        self.s_encoder = MLP(input_size=self.state_dim,
                             hidden_sizes=self.hidden_sizes,
                             output_size=self.state_embed_dim)
        self.s_encoder = ptu.send_to_gpu(self.s_encoder, self.gpu)
        self.s_decoder = MLP(input_size=self.state_embed_dim,
                             hidden_sizes=self.hidden_sizes,
                             output_size=self.state_dim)
        self.s_decoder = ptu.send_to_gpu(self.s_decoder, self.gpu)

        def _build_q_fn():
            q = MLP(input_size=self.state_embed_dim,
                    hidden_sizes=[self.state_embed_dim, self.state_embed_dim],
                    output_size=self.action_dim)
            q = ptu.send_to_gpu(q, self.gpu)
            return q

        self.q = _build_q_fn()
        self.q_target = _build_q_fn()
        self.mse_loss = nn.MSELoss()
        self.q_optimizer = torch.optim.Adam(list(self.q.parameters()),
                                            lr=self.q_lr)
        self.encoder_optimizer = torch.optim.Adam(
            list(self.s_encoder.parameters()) +
            list(self.s_decoder.parameters()),
            lr=self.encoder_lr)
        # Set target Q network's weights to be the same as the Q network.
        self.update_target_network()

        # Optionally load old version of state encoder/decoder
        if self.state_encoder_load_path is None:
            # If we didn't specify a particular state AE to load, check
            # whether there is a previous epoch's model to load.
            state_ae = _maybe_load_last_epochs_model(
                is_attacker=self.is_attacker)
        else:
            state_ae = torch.load(self.state_encoder_load_path)

        if state_ae is not None:
            self.s_encoder.load_state_dict(state_ae.s_encoder.state_dict())
            self.s_decoder.load_state_dict(state_ae.s_decoder.state_dict())

        # TODO(max): Implement `build_act_with_param_noise`.
        assert not self.parameter_noise, "Parameter noise not implemented."
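
`update_target_network` is called above but not shown in these examples. A minimal sketch of a typical hard target-network copy, assuming the method simply mirrors the online Q network's weights into the target network:

    def update_target_network(self):
        # Copy every parameter of the online Q network into the target Q network.
        self.q_target.load_state_dict(self.q.state_dict())
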
Example #3
    def __post_init__(self):
        nn.Module.__init__(self)
        self.q = MLP(
            input_size=self.input_size,
            hidden_sizes=self.hidden_sizes,
            output_size=self.output_size)
        self.q = ptu.send_to_gpu(self.q, self.gpu)
        self.q_target = MLP(
            input_size=self.input_size,
            hidden_sizes=self.hidden_sizes,
            output_size=self.output_size)
        self.q_target = ptu.send_to_gpu(self.q_target, self.gpu)
        self.optimizer = torch.optim.Adam(
            params=self.q.parameters(),
            lr=self.lr)
        # Set target Q network's weights to be the same as the Q network.
        self.update_target_network()

        # TODO(max): Implement `build_act_with_param_noise`.
        assert not self.parameter_noise, "Parameter noise not implemented."
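
Every example here assumes an `MLP` helper taking `input_size`, `hidden_sizes`, and `output_size` keyword arguments. A minimal sketch of such a module; the ReLU activation is an assumption, not something confirmed by these examples:

import torch.nn as nn


class MLP(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        sizes = [input_size] + list(hidden_sizes) + [output_size]
        layers = []
        for i in range(len(sizes) - 1):
            layers.append(nn.Linear(sizes[i], sizes[i + 1]))
            if i < len(sizes) - 2:  # no activation after the output layer
                layers.append(nn.ReLU())
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
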
Example #4
    def update(self, observations, actions, rewards, next_observations,
               done_mask, importance_weights, mask, training_attacker,
               summary_writer, t, **kwargs):
        observations = ptu.send_to_gpu(observations, self.gpu)
        actions = ptu.send_to_gpu(actions, self.gpu).long()
        rewards = ptu.send_to_gpu(rewards, self.gpu)
        next_observations = ptu.send_to_gpu(next_observations, self.gpu)
        done_mask = ptu.send_to_gpu(done_mask, self.gpu)
        importance_weights = ptu.send_to_gpu(importance_weights, self.gpu)
        if training_attacker:
            mask = ptu.send_to_gpu(mask, self.gpu)

        log_prefix = "attacker" if training_attacker else "defender"

        def _log_scalar(key, value):
            summary_writer.add_scalar(f"{log_prefix}/{key}", value, t)

        batch_dim = observations.shape[0]
        batch_range = ptu.send_to_gpu(torch.arange(0, batch_dim),
                                      self.gpu).long()

        # Target Q value, the return from the current state.
        # For double q:
        #   \hat{a} \gets \argmax_{a_{t+1}} Q(o_{t+1}, a_{t+1} | \theta)
        #   y \gets r_t + \gamma Q(o_{t+1}, \hat{a}_{t+1} | \theta^{-})
        q_next = self.q_target(self.s_encoder(next_observations).detach())

        if self.double_q:  # https://arxiv.org/abs/1509.06461 Eqn. 4.
            best_actions = self.q(self.s_encoder(next_observations).detach())
            if training_attacker:
                best_actions += mask
            best_actions = torch.argmax(best_actions, dim=-1)
            q_next = q_next[batch_range, best_actions]

        else:
            q_next = q_next.max(1)[0]

        q_next = (1.0 - done_mask) * q_next
        q_target = rewards + self.gamma * q_next

        # Actual Q value.
        # \hat{y} \gets Q(o_t, a_t | \theta)
        q_pred = self.q(self.s_encoder(observations).detach())
        q_pred = q_pred[batch_range, actions]

        # Compute the TD error with Huber loss.
        dqn_loss = F.smooth_l1_loss(q_pred,
                                    q_target.detach(),
                                    reduction="mean")
        _log_scalar("dqn_loss", dqn_loss.item())

        # Compute reconstruction loss.
        o_hat = self.s_decoder(self.s_encoder(observations))
        op_hat = self.s_decoder(self.s_encoder(next_observations))
        reconstruction_loss = self.mse_loss(
            o_hat, observations) + self.mse_loss(op_hat, next_observations)
        _log_scalar("reconstruction_loss", reconstruction_loss.item())

        # Perform update on Q network, clipping gradients after backprop.
        self.q_optimizer.zero_grad()
        dqn_loss.backward()
        if self.grad_norm_clipping is not None:
            nn.utils.clip_grad_norm_(self.q.parameters(),
                                     self.grad_norm_clipping)
        self.q_optimizer.step()

        # Perform encoder update, clipping gradients after backprop.
        self.encoder_optimizer.zero_grad()
        reconstruction_loss.backward()
        if self.grad_norm_clipping is not None:
            parameters = list(self.s_encoder.parameters())
            parameters += list(self.s_decoder.parameters())
            nn.utils.clip_grad_norm_(parameters, self.grad_norm_clipping)
        self.encoder_optimizer.step()
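
The comments above describe the double-Q target: the online network selects the next action, while the target network evaluates it. A small, self-contained numerical illustration of that target computation (all values made up):

import torch

gamma = 0.99
rewards = torch.tensor([1.0, 0.0])
done_mask = torch.tensor([0.0, 1.0])  # second transition is terminal
q_next_online = torch.tensor([[2.0, 5.0], [1.0, 0.5]])  # Q(o_{t+1}, . | theta)
q_next_target = torch.tensor([[1.5, 4.0], [0.8, 0.4]])  # Q(o_{t+1}, . | theta^-)

best_actions = torch.argmax(q_next_online, dim=-1)  # online network picks actions
batch_range = torch.arange(rewards.shape[0])
q_next = q_next_target[batch_range, best_actions]  # target network evaluates them
q_target = rewards + gamma * (1.0 - done_mask) * q_next
print(q_target)  # tensor([4.9600, 0.0000])
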
Example #5
def _build_q_fn():
    q = MLP(input_size=self.state_embed_dim,
            hidden_sizes=[self.state_embed_dim, self.state_embed_dim],
            output_size=self.action_dim)
    q = ptu.send_to_gpu(q, self.gpu)
    return q
Example #6
    def update(self, observations, actions, rewards, next_observations,
               done_mask, importance_weights, mask, training_attacker,
               summary_writer, t, **kwargs):
        """ Update the model's parameters based off a batch of experiences.

        :param observations:
        :param actions:
        :param rewards:
        :param next_observations:
        :param done_mask:
        :param importance_weights: A per-experience importance weighting.
        :param mask: An mask of the available actions at `next_observation`.
        :param training_attacker: Is the model we are training an attacker.
        :param summary_writer: TensorboardX SummaryWriter to report loss metrics.
        :param t: Current timestep.
        """
        observations = ptu.send_to_gpu(observations, self.gpu)
        actions = ptu.send_to_gpu(actions, self.gpu).long()
        rewards = ptu.send_to_gpu(rewards, self.gpu)
        next_observations = ptu.send_to_gpu(next_observations, self.gpu)
        done_mask = ptu.send_to_gpu(done_mask, self.gpu)
        importance_weights = ptu.send_to_gpu(importance_weights, self.gpu)
        if training_attacker:
            mask = ptu.send_to_gpu(mask, self.gpu)

        log_prefix = "attacker" if training_attacker else "defender"
        def _log_scalar(key, value):
            summary_writer.add_scalar(f"{log_prefix}/{key}", value, t)

        batch_dim = observations.shape[0]
        batch_range = ptu.send_to_gpu(torch.arange(0, batch_dim), self.gpu).long()

        # Target Q value, the return from the current state.
        # For double q:
        #   \hat{a} \gets \argmax_{a_{t+1}} Q(o_{t+1}, a_{t+1} | \theta)
        #   y \gets r_t + \gamma Q(o_{t+1}, \hat{a}_{t+1} | \theta^{-})
        q_next = self.q_target(next_observations)

        if self.double_q:  # https://arxiv.org/abs/1509.06461 Eqn. 4.
            best_actions = self.q(next_observations)
            if training_attacker:
                best_actions += mask
            best_actions = torch.argmax(best_actions, dim=-1)
            q_next = q_next[batch_range, best_actions]

        else:
            q_next = q_next.max(1)[0]

        q_next = (1.0 - done_mask) * q_next
        q_target = rewards + self.gamma * q_next

        # Actual Q value.
        # \hat{y} \gets Q(o_t, a_t | \theta)
        q_pred = self.q(observations)
        q_pred = q_pred[batch_range, actions]

        # Compute the TD error with Huber loss.
        loss = F.smooth_l1_loss(q_pred, q_target.detach(), reduction="mean")
        _log_scalar("loss", loss.item())

        # Perform update on Q network, clipping gradients after backprop.
        self.optimizer.zero_grad()
        loss.backward()
        if self.grad_norm_clipping is not None:
            nn.utils.clip_grad_norm_(self.q.parameters(),
                                     self.grad_norm_clipping)
        self.optimizer.step()
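
Both `update` methods use the Huber (smooth L1) loss as the TD error: quadratic for small errors, linear for large ones, which limits the influence of outlier targets. A tiny self-contained check with made-up values:

import torch
import torch.nn.functional as F

q_pred = torch.tensor([1.0, 4.0])
q_target = torch.tensor([1.5, 0.0])
# |error| = 0.5 -> 0.5 * 0.5^2 = 0.125; |error| = 4.0 -> 4.0 - 0.5 = 3.5
print(F.smooth_l1_loss(q_pred, q_target, reduction="mean"))  # tensor(1.8125)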