Example #1
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        # TODO return the action that the policy prescribes

        self.eval()

        observation = ptu.from_numpy(observation)
        action = self(observation)
        return ptu.to_numpy(action)
Example #2
    def get_action(self, obs: np.ndarray) -> np.ndarray:

        if len(obs.shape) > 1:
            observation = ptu.from_numpy(obs)
        else:
            observation = ptu.from_numpy(obs[None])

        #  return the action that the policy prescribes
        if self.discrete:
            # Greedy action from the logits; convert to numpy to match the return type.
            return ptu.to_numpy(self.logits_na(observation).argmax(dim=-1))
        else:
            return ptu.to_numpy(self.mean_net(observation))
Example #3
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        # TODO return the action that the policy prescribes
        prediction = self(ptu.from_numpy(observation))
        if self.discrete:
            # Assumes `forward` returns raw action logits for discrete policies.
            res = torch.argmax(prediction)
        else:
            # Assumes `forward` returns a torch Distribution for continuous policies.
            res = prediction.rsample()
        return ptu.to_numpy(res)
Example #4
    def run_baseline_prediction(self, obs):
        """
            Helper function that converts `obs` to a tensor,
            calls the forward method of the baseline MLP,
            and returns a np array

            Input: `obs`: np.ndarray of size [N, 1]
            Output: np.ndarray of size [N]

        """
        obs = ptu.from_numpy(obs)
        predictions = self.baseline(obs)
        return ptu.to_numpy(predictions)[:, 0]
Example #5
    def update(self, ob_no, ac_na, reward_n, next_ob_no, terminal_n):
        """
            Update the parameters of the critic.
            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories
            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
                reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
                    the reward for each timestep
                terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
                    at that timestep or 0 if the episode did not end
            returns:
                nothing
        """
        ob_no = ptu.from_numpy(ob_no)
        ac_na = ptu.from_numpy(ac_na).to(torch.long)
        next_ob_no = ptu.from_numpy(next_ob_no)
        reward_n = ptu.from_numpy(reward_n)
        terminal_n = ptu.from_numpy(terminal_n)

        qa_t_values = self.q_net(ob_no)
        q_t_values = torch.gather(qa_t_values, 1,
                                  ac_na.unsqueeze(1)).squeeze(1)

        qa_tp1_values = self.q_net_target(next_ob_no)

        if self.double_q:
            acs = self.q_net(next_ob_no).argmax(dim=1)
            q_tp1 = torch.gather(qa_tp1_values, 1, acs.unsqueeze(1)).squeeze(1)
        else:
            q_tp1, _ = qa_tp1_values.max(dim=1)
        # HINT: as you saw in lecture, this would be:
        #currentReward + self.gamma * qValuesOfNextTimestep * (not terminal)
        target = reward_n + self.gamma * q_tp1 * (1.0 - terminal_n)
        target = target.detach()

        assert q_t_values.shape == target.shape
        loss = self.loss(q_t_values, target)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_value_(self.q_net.parameters(),
                               self.grad_norm_clipping)
        self.optimizer.step()

        self.learning_rate_scheduler.step()
        return {
            'Training Loss': ptu.to_numpy(loss),
        }
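For reference, the target computed above is the textbook Q-learning bootstrap; with reward $r$, done flag $d$, and discount $\gamma$ (nothing here is specific to this codebase):

$$y = r + \gamma (1 - d) \max_{a'} Q_{\text{target}}(s', a') \quad \text{(vanilla DQN)}$$

$$y = r + \gamma (1 - d)\, Q_{\text{target}}\big(s', \operatorname*{arg\,max}_{a'} Q(s', a')\big) \quad \text{(double DQN, the double_q branch)}$$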
Example #6
    def update(self, observations, actions, advantages, q_values=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
        dist = self(observations)
        loss = -torch.mean(dist.log_prob(actions) * advantages)

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            mean = np.mean(q_values)
            std = np.std(q_values)
            targets = normalize(q_values, mean, std)
            targets = ptu.from_numpy(targets)

            ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = torch.squeeze(self.baseline(observations))

            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape

            # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            # HINT: use `F.mse_loss`
            baseline_loss = self.baseline_loss(baseline_predictions, targets)

            # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
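The loss above is the negative of the standard policy-gradient surrogate from the hint, so minimizing it with `optimizer.step()` ascends the objective whose gradient estimate is

$$\nabla_\theta J(\theta) \approx \frac{1}{N} \sum_{i=1}^{N} \sum_{t=0}^{T-1} \nabla_\theta \log \pi_\theta(a_{i,t} \mid s_{i,t})\,\big(Q_{i,t} - b_{i,t}\big),$$

with the advantages passed in playing the role of $Q_{i,t} - b_{i,t}$.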
Example #7
    def forward(self, ob_no):
        if self.hash:
            # Count-based exploration bonus: discretize the encoded observation
            # and count how often each code has been seen.
            codes = ptu.to_numpy(self.encoder(ob_no).round())
            counts = np.zeros(len(codes))
            for i, code in enumerate(codes):
                counts[i] = self.counts[str(code)]
                self.counts[str(code)] += 1
            return 1 / np.sqrt(counts + 1)
        else:
            # Get the prediction error for ob_no
            # HINT: Remember to detach the output of self.f!
            error = ((self.f(ob_no).detach() - self.f_hat(ob_no)) ** 2).mean(dim=1)
            return error
Example #8
    def update(self, observations, actions, next_observations,
               data_statistics):
        """
        :param observations: numpy array of observations
        :param actions: numpy array of actions
        :param next_observations: numpy array of next observations
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return:
        """
        # Hint: you should use `data_statistics['delta_mean']` and
        # `data_statistics['delta_std']`, which keep track of the mean
        # and standard deviation of the model.
        obs = ptu.from_numpy(observations)
        acs = ptu.from_numpy(actions)
        # Note: this relies on the dict keys appearing in the documented order above.
        self.update_statistics(*data_statistics.values())
        # TODO(Q1) compute the normalized target for the model.
        target = normalize(
            ptu.from_numpy(next_observations - observations),
            self.delta_mean, self.delta_std)
        pred = self(obs, acs, self.obs_mean, self.obs_std, self.acs_mean,
                    self.acs_std, self.delta_mean, self.delta_std)[1]

        loss = self.loss(pred, target)  # TODO(Q1) compute the loss
        # Hint: `self(...)` returns a tuple, but you only need to use one of the
        # outputs.

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
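Examples #8, #9, #11, and #25 all regress the dynamics model onto the same target, the normalized state difference; written out, the formula behind the code above is

$$\tilde{\Delta}_t = \frac{(s_{t+1} - s_t) - \mu_\Delta}{\sigma_\Delta},$$

where $\mu_\Delta$ and $\sigma_\Delta$ are `data_statistics['delta_mean']` and `data_statistics['delta_std']`.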
Example #9
    def update(self, observations, actions, next_observations,
               data_statistics):
        """
        :param observations: numpy array of observations
        :param actions: numpy array of actions
        :param next_observations: numpy array of next observations
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return:
        """

        target = ptu.from_numpy(
            ((next_observations - observations) -
             data_statistics['delta_mean']) / data_statistics['delta_std']
        )  # TODO(Q1) compute the normalized target for the model.
        # Hint: you should use `data_statistics['delta_mean']` and
        # `data_statistics['delta_std']`, which keep track of the mean
        # and standard deviation of the model.

        self.update_statistics(data_statistics['obs_mean'],
                               data_statistics['obs_std'],
                               data_statistics['acs_mean'],
                               data_statistics['acs_std'],
                               data_statistics['delta_mean'],
                               data_statistics['delta_std'])
        _, output = self.forward(ptu.from_numpy(observations),
                                 ptu.from_numpy(actions), self.obs_mean,
                                 self.obs_std, self.acs_mean, self.acs_std,
                                 self.delta_mean, self.delta_std)

        loss = self.loss(target, output)  # TODO(Q1) compute the loss
        # Hint: `self(...)` returns a tuple, but you only need to use one of the
        # outputs.

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
Example #10
    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """

        # TODO store the latest observation ("frame") into the replay buffer
        # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer`
        # in dqn_utils.py
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # TODO use epsilon greedy exploration when selecting action
        perform_random_action = (np.random.random() <
                                 eps) or (self.t < self.learning_starts)
        if perform_random_action:
            # HINT: take random action
            # with probability eps (see np.random.random())
            # OR if your current step number (see self.t) is less that self.learning_starts
            action = self.env.action_space.sample()  # sample a uniformly random action
        else:
            # HINT: Your actor will take in multiple previous observations ("frames") in order
            # to deal with the partial observability of the environment. Get the most recent
            # `frame_history_len` observations using functionality from the replay buffer,
            # and then use those observations as input to your actor.
            frames = self.replay_buffer.encode_recent_observation()
            # Note: assumes `get_action` returns a torch tensor; if it already
            # returns a numpy array, the ptu.to_numpy conversion is unnecessary.
            action = ptu.to_numpy(self.actor.get_action(frames))

        # TODO take a step in the environment using the action from the policy
        # HINT1: remember that self.last_obs must always point to the newest/latest observation
        # HINT2: remember the following useful function that you've seen before:
        #obs, reward, done, info = env.step(action)
        obs, reward, done, info = self.env.step(action)

        # TODO store the result of taking this action into the replay buffer
        # HINT1: see your replay buffer's `store_effect` function
        # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # TODO if taking this step resulted in done, reset the env (and the latest observation)
        if done:
            self.last_obs = self.env.reset()
        else:
            self.last_obs = obs
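The action-selection branch above is plain epsilon-greedy. As a minimal standalone sketch of the same rule (the `epsilon_greedy` helper below is purely illustrative and not part of the assignment code):

import numpy as np

def epsilon_greedy(q_values: np.ndarray, eps: float) -> int:
    # With probability eps pick a uniformly random action, otherwise the greedy one.
    if np.random.random() < eps:
        return int(np.random.randint(len(q_values)))
    return int(np.argmax(q_values))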
Example #11
    def update(self, observations, actions, next_observations,
               data_statistics):
        """
        :param observations: numpy array of observations
        :param actions: numpy array of actions
        :param next_observations: numpy array of next observations
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return:
        """
        # TODO(Q1) compute the normalized target for the model.
        delta_mean = ptu.from_numpy(data_statistics["delta_mean"]).to(
            ptu.device)
        delta_std = ptu.from_numpy(data_statistics["delta_std"]).to(ptu.device)
        target = ptu.from_numpy(next_observations - observations).to(
            ptu.device)
        target = (target - delta_mean) / delta_std

        # Hint: you should use `data_statistics['delta_mean']` and
        # `data_statistics['delta_std']`, which keep track of the mean
        # and standard deviation of the model.
        obs_mean, obs_std = data_statistics["obs_mean"], data_statistics[
            "obs_std"]
        acs_mean, acs_std = data_statistics["acs_mean"], data_statistics[
            "acs_std"]
        obs_normalized = (observations - obs_mean) / obs_std  # TODO(Q1)
        acs_normalized = (actions - acs_mean) / acs_std  # TODO(Q1)
        obs_normalized = ptu.from_numpy(obs_normalized).to(ptu.device)
        acs_normalized = ptu.from_numpy(acs_normalized).to(ptu.device)
        concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

        y_hat = self.delta_network(concatenated_input)
        loss = self.loss(y_hat, target)  # TODO(Q1) compute the loss
        # Hint: `self(...)` returns a tuple, but you only need to use one of the
        # outputs.

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
Example #12
    def update(self,
               observations,
               actions,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        # Assumes `observations` and `actions` are already torch tensors on the right device.
        loss = self.loss(self(observations), actions)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
        }
Example #13
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        obs = np.asarray(obs)
        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        # Return the action that the policy prescribes.
        observation = ptu.from_numpy(observation)
        if self.discrete:
            logits = self.logits_na(observation)
            m = torch.distributions.Categorical(logits=logits)
            action_to_take = m.sample()
            return ptu.to_numpy(action_to_take)
        else:
            # Reparameterized sample: mean + std * eps, with std = exp(logstd).
            pred_mu = self.mean_net(observation)
            std = torch.exp(self.logstd)
            eps = torch.randn_like(pred_mu)
            pred = pred_mu + eps * std
            return ptu.to_numpy(pred)
Example #14
 def get_action(self, obs: np.ndarray) -> np.ndarray:
     # this obs has a dim of ob_dim + ac_dim*(1+MAX_CAND_NUM)
     if len(obs.shape) > 2:
         observation = obs
     else:
         observation = obs[None]
     observation = ptu.from_numpy(observation)
     action_distributions = self(observation)
     actions = []
     for i in range(self.n_drivers):
         action_i = ptu.to_numpy(
             action_distributions[i].sample())  # don't bother with rsample
         actions.append(action_i)
     actions = np.array(actions)
     return actions.T
Example #15
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        # TODO return the action that the policy prescribes

        # Args: obs (numpy.ndarray): observation from the environment.
        # Returns: numpy.ndarray: action sampled from the policy's forward distribution.
        # Note: a numpy array is returned instead of a tensor so callers don't need torch.

        action_distribution = self.forward(ptu.from_numpy(observation))
        action = action_distribution.sample()

        return ptu.to_numpy(action)
Example #16
    def get_action(self, obs: np.ndarray) -> np.ndarray:

        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        # TODO return the action that the policy prescribes

        if self.discrete:
            action = self(ptu.from_numpy(observation)).sample()
        else:
            action = self(ptu.from_numpy(observation)).rsample()

        return ptu.to_numpy(action)
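Several of these `get_action` implementations (Examples #13, #16, #28, #30) rely on the difference between `sample()` and `rsample()` on torch distributions: `rsample()` draws a reparameterized sample (e.g. mean + std * eps) through which gradients can flow, while `sample()` cuts the gradient path. A small self-contained check using only standard PyTorch:

import torch

mean = torch.zeros(3, requires_grad=True)
dist = torch.distributions.Normal(mean, torch.ones(3))

x = dist.rsample()          # reparameterized: gradients flow back to `mean`
x.sum().backward()
print(mean.grad)            # a real gradient, not None

y = dist.sample()           # not reparameterized: detached from `mean`
print(y.requires_grad)      # False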
Example #17
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        # TODO return the action that the policy prescribes
        observation = ptu.from_numpy(observation)

        if self.discrete:
            action = self.logits_na(observation)
            action = torch.argmax(action, dim=1)
        else:
            action = self.mean_net(observation)
        return ptu.to_numpy(action)
Example #18
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        # DONE: return the action that the policy prescribes
        # Cast to PyTorch tensor
        observation = ptu.from_numpy(observation)
        if self.discrete:
            action = self.forward(observation).sample()
        else:
            # Assumes `forward` returns the mean action tensor in the continuous case.
            action = self.forward(observation)
        return ptu.to_numpy(action)
Example #19
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        # TODO: get this from hw1
        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        # TODO return the action that the policy prescribes

        self.eval()

        observation = ptu.from_numpy(observation)
        action_distribution = self(observation)
        action = action_distribution.sample()
        return ptu.to_numpy(action)
Example #20
 def update(self,
            observations,
            actions,
            adv_n=None,
            acs_labels_na=None,
            qvals=None):
     # Update the policy via the parent class and return the loss.
     # Note: forwards the actual arguments rather than hard-coded None values,
     # and assumes the parent `update` returns the loss tensor itself.
     loss = super().update(observations,
                           actions,
                           adv_n=adv_n,
                           acs_labels_na=acs_labels_na,
                           qvals=qvals)
     return {
         # You can add extra logging information here, but keep this line
         'Training Loss': ptu.to_numpy(loss),
     }
Example #21
    def update(self,
               observations,
               actions,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        # Update the policy and return the loss.
        # Use `forward` on tensors rather than `get_action`, which returns a
        # numpy array and would break the gradient flow.
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        actions_pred = self(observations)
        loss = self.loss(actions_pred, actions)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
        }
Example #22
 def update(self,
            observations,
            actions,
            adv_n=None,
            acs_labels_na=None,
            qvals=None):
     # TODO: update the policy and return the loss
     self.optimizer.zero_grad()
     current_action = self.forward(ptu.from_numpy(observations))
     loss = self.loss(current_action, ptu.from_numpy(actions))
     loss.backward()
     self.optimizer.step()
     return {
         # You can add extra logging information here, but keep this line
         'Training Loss': ptu.to_numpy(loss),
     }
Example #23
    def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    ):
        """
        :param obs_unnormalized: Unnormalized observations
        :param acs_unnormalized: Unnormalized actions
        :param obs_mean: Mean of observations
        :param obs_std: Standard deviation of observations
        :param acs_mean: Mean of actions
        :param acs_std: Standard deviation of actions
        :param delta_mean: Mean of state difference `s_t+1 - s_t`.
        :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
        :return: tuple `(next_obs_pred, delta_pred_normalized)`
        This forward function should return a tuple of two items
            1. `next_obs_pred` which is the predicted `s_t+1`
            2. `delta_pred_normalized` which is the normalized (i.e. not
                unnormalized) output of the delta network. This is needed
        """
        # normalize input data to mean 0, std 1
        # obs_normalized = # TODO(Q1) -------------------
        # acs_normalized = # TODO(Q1) -------------------
        obs_normalized = ptu.from_numpy(
            normalize(obs_unnormalized, obs_mean, obs_std))
        acs_normalized = ptu.from_numpy(
            normalize(acs_unnormalized, acs_mean, acs_std))

        # predicted change in obs
        concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

        # TODO(Q1) compute delta_pred_normalized and next_obs_pred --------------------
        # Hint: as described in the PDF, the output of the network is the
        # *normalized change* in state, i.e. normalized(s_t+1 - s_t).
        # delta_pred_normalized = # TODO(Q1) ---------------------
        delta_pred_normalized = self.delta_network(concatenated_input)
        # next_obs_pred = # TODO(Q1) --------------------
        next_obs_pred = ptu.from_numpy(obs_unnormalized) + ptu.from_numpy(
            unnormalize(ptu.to_numpy(delta_pred_normalized), delta_mean,
                        delta_std))
        return next_obs_pred, delta_pred_normalized
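In the notation of the docstring, the forward pass above computes (all operations elementwise):

$$\tilde{s}_t = \frac{s_t - \mu_s}{\sigma_s}, \qquad \tilde{a}_t = \frac{a_t - \mu_a}{\sigma_a}, \qquad \hat{\Delta}_{\text{norm}} = f_\theta\big([\tilde{s}_t, \tilde{a}_t]\big), \qquad \hat{s}_{t+1} = s_t + \mu_\Delta + \sigma_\Delta \odot \hat{\Delta}_{\text{norm}}.$$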
Example #24
    def update(self, observations, actions, advantages, n_rollouts=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        if self.discrete:
            actions = actions.to(torch.int64)
            # logits: (batch_size, seq_len, action_dim)
            logits = self.forward(observations)
            # log_pi: (batch_size, seq_len)
            log_pi = logits.gather(dim=-1, index=actions.unsqueeze(
                dim=-1)).squeeze(dim=-1) - logits.logsumexp(dim=-1,
                                                            keepdim=False)
        else:
            acs_mean = self.forward(observations)
            # log_pi: (batch_size, seq_len, action_dim)
            log_pi = self.normal_dist.log_prob(
                normalize(data=actions,
                          mean=acs_mean,
                          std=torch.exp(self.logstd)))
            # log_pi: (batch_size, seq_len)
            log_pi = torch.sum(log_pi, dim=-1)

        assert log_pi.shape == advantages.shape
        if n_rollouts is not None and advantages.dim() == 1:
            # All rollouts are concatenated, so divide by n_rollouts to get an average
            # (this must happen before the loss is computed to have any effect).
            log_pi = log_pi / n_rollouts
        loss = -torch.mean(torch.sum(log_pi * advantages, dim=-1), dim=0)

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
Example #25
    def update(self, observations, actions, next_observations,
               data_statistics):
        """
        :param observations: numpy array of observations
        :param actions: numpy array of actions
        :param next_observations: numpy array of next observations
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return:
        """
        target = ptu.from_numpy(
            normalize(
                data=next_observations - observations,
                mean=data_statistics['delta_mean'],
                std=data_statistics['delta_std'],
            ))  # TODO(Q1) compute the normalized target for the model.
        # Hint: you should use `data_statistics['delta_mean']` and
        # `data_statistics['delta_std']`, which keep track of the mean
        # and standard deviation of the model.

        loss = self.loss(
            self.forward(
                obs_unnormalized=observations,
                acs_unnormalized=actions,
                obs_mean=data_statistics['obs_mean'],
                obs_std=data_statistics['obs_std'],
                acs_mean=data_statistics['acs_mean'],
                acs_std=data_statistics['acs_std'],
                delta_mean=data_statistics['delta_mean'],
                delta_std=data_statistics['delta_std'],
            )[1], target)  # TODO(Q1) compute the loss
        # Hint: `self(...)` returns a tuple, but you only need to use one of the
        # outputs.

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
Example #26
 def update(self,
            observations,
            actions,
            adv_n=None,
            acs_labels_na=None,
            qvals=None):
     # Update the policy and return the loss.
     # Note: torch.from_numpy keeps tensors on the CPU; ptu.from_numpy would also
     # move them to the configured device.
     loss = self.loss(
         torch.from_numpy(actions).type(torch.float32),
         self.mean_net(torch.from_numpy(observations).type(torch.float32)))
     self.optimizer.zero_grad()
     loss.backward()
     self.optimizer.step()
     return {
         # You can add extra logging information here, but keep this line
         'Training Loss': ptu.to_numpy(loss),
     }
Example #27
 def update(self,
            observations,
            actions,
            adv_n=None,
            acs_labels_na=None,
            qvals=None):
     # TODO: update the policy and return the loss
     # modified from https://pytorch.org/docs/stable/optim.html#taking-an-optimization-step
     self.optimizer.zero_grad()
     current_action = self.forward(ptu.from_numpy(observations))
     loss = self.loss(current_action, ptu.from_numpy(actions))
     loss.backward()
     self.optimizer.step()
     return {
         # You can add extra logging information here, but keep this line
         'Training Loss': ptu.to_numpy(loss),
     }
Example #28
    def get_action(self, obs: np.ndarray) -> np.ndarray:

        if len(obs.shape) > 1:
            observation = obs
        else:
            observation = obs[None]

        if self.discrete:
            action = self(ptu.from_numpy(observation)).sample()
        else:
            dist = self(ptu.from_numpy(observation))
            raw_action = dist.rsample()
            # Squash the raw sample into the valid action range.
            squash_action = torch.tanh(raw_action)
            action = squash_action * self.action_scale + self.action_bias

        return ptu.to_numpy(action)[0]
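The squashing in the continuous branch is the usual convention for bounded action spaces; assuming `action_scale` and `action_bias` are derived from the action limits in the standard way (an assumption, since their definitions are not shown here),

$$a = \tanh(u)\,\frac{a_{\text{high}} - a_{\text{low}}}{2} + \frac{a_{\text{high}} + a_{\text{low}}}{2}, \qquad u \sim \pi_\theta(\cdot \mid s).$$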
Example #29
    def get_action(self, obs):
        # MJ: changed the dimension check to a 3
        if len(obs.shape) > 3:
            observation = obs
        else:
            observation = obs[None]

        ## TODO return the action that maximizes the Q-value
        # at the current observation as the output
        # actions is actually (batch_size=1, )
        if not isinstance(observation, torch.Tensor):
            observation = ptu.from_numpy(observation)

        actions = ptu.to_numpy(
            self.critic.q_net(observation).argmax(dim=-1, keepdim=False))

        return actions.squeeze()
Example #30
    def update(self,
               observations,
               actions,
               adv_n=None,
               acs_labels_na=None,
               qvals=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        action_distribution = self(observations)
        predicted_actions = action_distribution.rsample()
        loss = self.loss(predicted_actions, actions)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {'Training Loss': ptu.to_numpy(loss)}