def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Run the policy on ``obs`` and return its output as a numpy array.

    An observation whose shape has at most one axis gets a leading axis
    prepended (``obs[None]``); anything with more axes is used unchanged.

    Side effect: ``self.eval()`` is called before the forward pass and the
    mode is not restored afterwards.
    """
    batched = obs if len(obs.shape) > 1 else obs[None]
    self.eval()
    policy_output = self(ptu.from_numpy(batched))
    return ptu.to_numpy(policy_output)
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Return the deterministic action the policy prescribes for ``obs``.

    An observation with at most one axis gets a leading batch axis.

    Returns:
        Discrete policy: the index of the largest logit. For a single
        observation this is a plain Python int (as before); for a batch it
        is an array with one index per row.
        Continuous policy: the output of ``self.mean_net`` as a numpy array.
    """
    batched = obs if len(obs.shape) > 1 else obs[None]
    observation = ptu.from_numpy(batched)

    if self.discrete:
        # Reduce over the action axis only. A bare ``argmax()`` flattens the
        # whole batch and yields an index into the flattened logits.
        greedy = self.logits_na(observation).argmax(dim=-1)
        if greedy.numel() == 1:
            return greedy.item()
        return ptu.to_numpy(greedy)
    return ptu.to_numpy(self.mean_net(observation))
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Query the policy for ``obs`` and return the result as a numpy array.

    An observation with at most one axis gets a leading axis prepended.
    For a discrete policy the result is ``torch.argmax`` of the forward
    output (taken over all elements, no ``dim``); otherwise it is
    ``rsample()`` of the forward output.
    """
    batched = obs[None] if len(obs.shape) <= 1 else obs
    prediction = self(ptu.from_numpy(batched))
    if self.discrete:
        return ptu.to_numpy(torch.argmax(prediction))
    return ptu.to_numpy(prediction.rsample())
def run_baseline_prediction(self, obs):
    """Evaluate the baseline network on ``obs`` and return a numpy array.

    ``obs`` is converted to a tensor, passed through ``self.baseline``, and
    column 0 of the (at least 2-D) result is returned.

    Input:
        `obs`: np.ndarray (described by the original author as size [N, 1])
    Output:
        np.ndarray of size [N]
    """
    obs_tensor = ptu.from_numpy(obs)
    baseline_out = ptu.to_numpy(self.baseline(obs_tensor))
    return baseline_out[:, 0]
def update(self, ob_no, ac_na, reward_n, next_ob_no, terminal_n):
    """Take one gradient step on the critic's Q-network.

    Arguments (as documented by the original author):
        ob_no: shape (sum_of_path_lengths, ob_dim)
        ac_na: actions taken; cast to ``torch.long`` and used as gather indices
        reward_n: length sum_of_path_lengths, scalar reward per timestep
        next_ob_no: shape (sum_of_path_lengths, ob_dim), observation after
            one step
        terminal_n: length sum_of_path_lengths, 1 if the episode ended at
            that timestep and 0 otherwise

    Returns:
        dict with key 'Training Loss' holding the loss as a numpy value.
    """
    obs = ptu.from_numpy(ob_no)
    acs = ptu.from_numpy(ac_na).to(torch.long)
    next_obs = ptu.from_numpy(next_ob_no)
    rewards = ptu.from_numpy(reward_n)
    terminals = ptu.from_numpy(terminal_n)

    # Q-value of the action actually taken at each step.
    q_taken = self.q_net(obs).gather(1, acs.unsqueeze(1)).squeeze(1)

    target_q_all = self.q_net_target(next_obs)
    if not self.double_q:
        next_q = target_q_all.max(dim=1)[0]
    else:
        # Double Q: the online net picks the action, the target net scores it.
        greedy_acs = self.q_net(next_obs).argmax(dim=1)
        next_q = target_q_all.gather(1, greedy_acs.unsqueeze(1)).squeeze(1)

    # reward + gamma * next-step value, zeroed where the episode terminated.
    td_target = (rewards + self.gamma * next_q * (1.0 - terminals)).detach()
    assert q_taken.shape == td_target.shape

    loss = self.loss(q_taken, td_target)
    self.optimizer.zero_grad()
    loss.backward()
    utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
    self.optimizer.step()
    self.learning_rate_scheduler.step()

    return {'Training Loss': ptu.to_numpy(loss)}
def update(self, observations, actions, advantages, q_values=None):
    """Run one policy-gradient step and, if enabled, one baseline step.

    Args:
        observations, actions, advantages: numpy arrays, converted to tensors.
        q_values: numpy array of targets for the baseline; only read when
            ``self.nn_baseline`` is truthy.

    Returns:
        dict with key 'Training Loss' holding the policy loss.
    """
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # The optimizer minimizes, so negate the advantage-weighted log-prob.
    action_dist = self(observations)
    loss = -torch.mean(action_dist.log_prob(actions) * advantages)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        # Regress the baseline onto q-values normalized with their own
        # mean and standard deviation.
        targets = ptu.from_numpy(
            normalize(q_values, np.mean(q_values), np.std(q_values))
        )

        # Squeeze only the trailing axis: a bare squeeze() also drops a
        # batch axis of size 1 and makes the shape check below fail.
        baseline_predictions = self.baseline(observations).squeeze(dim=-1)
        assert baseline_predictions.shape == targets.shape

        baseline_loss = self.baseline_loss(baseline_predictions, targets)
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
def forward(self, ob_no):
    """Return a per-observation exploration signal for ``ob_no``.

    When ``self.hash`` is falsy, returns the squared difference between the
    detached output of ``self.f`` and the output of ``self.f_hat``, averaged
    over axis 1.

    When ``self.hash`` is truthy, each observation is encoded and rounded,
    its string form is looked up in ``self.counts``, and the stored count is
    incremented. The result is the numpy array ``1 / sqrt(count + 1)`` using
    the counts seen before each increment.
    """
    if not self.hash:
        target_features = self.f.forward(ob_no).detach()
        squared_diff = (target_features - self.f_hat(ob_no)) ** 2
        return squared_diff.mean(axis=1)

    codes = ptu.to_numpy(self.encoder(ob_no).round())
    counts = np.zeros(len(codes))
    # Kept sequential: a repeated code later in the batch must see the
    # count already incremented by its earlier occurrence.
    for idx, code in enumerate(codes):
        key = str(code)
        counts[idx] = self.counts[key]
        self.counts[key] += 1
    return 1 / np.sqrt(counts + 1)
def update(self, observations, actions, next_observations, data_statistics):
    """Take one gradient step on the dynamics model.

    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
         - 'obs_mean'
         - 'obs_std'
         - 'acs_mean'
         - 'acs_std'
         - 'delta_mean'
         - 'delta_std'
    :return: dict with key 'Training Loss'
    """
    obs = ptu.from_numpy(observations)
    acs = ptu.from_numpy(actions)

    # Look each statistic up by key rather than unpacking
    # ``data_statistics.values()``, which silently depends on the dict's
    # insertion order and breaks on extra keys.
    self.update_statistics(
        data_statistics['obs_mean'],
        data_statistics['obs_std'],
        data_statistics['acs_mean'],
        data_statistics['acs_std'],
        data_statistics['delta_mean'],
        data_statistics['delta_std'],
    )

    # The regression target is the normalized state change.
    target = normalize(
        ptu.from_numpy(next_observations - observations),
        self.delta_mean,
        self.delta_std,
    )

    # The forward pass returns a tuple; element 1 is compared to the target.
    pred = self(
        obs,
        acs,
        self.obs_mean,
        self.obs_std,
        self.acs_mean,
        self.acs_std,
        self.delta_mean,
        self.delta_std,
    )[1]
    loss = self.loss(pred, target)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, next_observations, data_statistics):
    """Take one gradient step on the dynamics model.

    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
         - 'obs_mean'
         - 'obs_std'
         - 'acs_mean'
         - 'acs_std'
         - 'delta_mean'
         - 'delta_std'
    :return: dict with key 'Training Loss'
    """
    delta_mean = data_statistics['delta_mean']
    delta_std = data_statistics['delta_std']

    # Normalized state change, computed in numpy and then converted.
    target = ptu.from_numpy(
        ((next_observations - observations) - delta_mean) / delta_std
    )

    self.update_statistics(
        data_statistics['obs_mean'],
        data_statistics['obs_std'],
        data_statistics['acs_mean'],
        data_statistics['acs_std'],
        delta_mean,
        delta_std,
    )

    # forward returns a 2-tuple; the second item is compared to the target.
    _, delta_pred_normalized = self.forward(
        ptu.from_numpy(observations),
        ptu.from_numpy(actions),
        self.obs_mean,
        self.obs_std,
        self.acs_mean,
        self.acs_std,
        self.delta_mean,
        self.delta_std,
    )

    # torch losses take (input, target): prediction first, target second.
    loss = self.loss(delta_pred_normalized, target)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
def step_env(self):
    """Advance the environment by one step and record the transition.

    The latest observation is stored in the replay buffer, an action is
    chosen (random or from the actor), the environment is stepped, and the
    outcome is stored against the same buffer index. Afterwards
    ``self.last_obs`` holds the newest observation: the reset observation if
    the step ended the episode, otherwise the one returned by the step.
    """
    self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

    # Act randomly with probability eps, or always while the step counter
    # is below learning_starts.
    eps = self.exploration.value(self.t)
    explore = (np.random.random() < eps) or (self.t < self.learning_starts)

    if not explore:
        # The actor is fed the replay buffer's encoding of recent frames.
        frames = self.replay_buffer.encode_recent_observation()
        # NOTE(review): this assumes actor.get_action returns something
        # ptu.to_numpy accepts -- confirm against the actor implementation.
        action = ptu.to_numpy(self.actor.get_action(frames))
    else:
        action = self.env.action_space.sample()

    obs, reward, done, _info = self.env.step(action)
    self.replay_buffer.store_effect(
        self.replay_buffer_idx, action, reward, done
    )

    self.last_obs = self.env.reset() if done else obs
def update(self, observations, actions, next_observations, data_statistics):
    """Take one gradient step on ``self.delta_network``.

    The normalized observations and actions are concatenated as the network
    input; the target is the normalized state change.

    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
         - 'obs_mean'
         - 'obs_std'
         - 'acs_mean'
         - 'acs_std'
         - 'delta_mean'
         - 'delta_std'
    :return: dict with key 'Training Loss'
    """

    def as_device_tensor(array):
        # Convert a numpy array to a tensor on ptu.device.
        return ptu.from_numpy(array).to(ptu.device)

    stats = data_statistics

    # Target: (s_{t+1} - s_t - delta_mean) / delta_std, computed on tensors.
    # NOTE(review): no epsilon is added to the std divisors here -- confirm
    # the statistics can never contain zeros.
    delta_mean = as_device_tensor(stats["delta_mean"])
    delta_std = as_device_tensor(stats["delta_std"])
    raw_delta = as_device_tensor(next_observations - observations)
    target = (raw_delta - delta_mean) / delta_std

    # Inputs are normalized in numpy first, then converted.
    obs_normalized = as_device_tensor(
        (observations - stats["obs_mean"]) / stats["obs_std"]
    )
    acs_normalized = as_device_tensor(
        (actions - stats["acs_mean"]) / stats["acs_std"]
    )

    network_input = torch.cat([obs_normalized, acs_normalized], dim=1)
    loss = self.loss(self.delta_network(network_input), target)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    """Take one optimizer step on the loss between the policy output and ``actions``.

    ``observations`` and ``actions`` are passed to the policy and the loss
    as given, with no conversion here. ``adv_n``, ``acs_labels_na`` and
    ``qvals`` are accepted but not used.

    Returns:
        dict with key 'Training Loss' holding the loss.
    """
    predicted = self(observations)
    loss = self.loss(predicted, actions)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    train_log = {'Training Loss': ptu.to_numpy(loss)}
    return train_log
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Sample an action from the policy for ``obs``.

    Discrete policy: samples from a Categorical built on the logits from
    ``self.logits_na``. Continuous policy: returns ``mean + eps * std`` with
    ``mean`` from ``self.mean_net``, ``std = exp(self.logstd)`` and ``eps``
    standard-normal noise shaped like the mean.

    NOTE(review): ``obs`` is fed to the network exactly as given. The
    previous code built a batched copy (``obs[None]`` for 1-D input) but
    never used it; that dead local is removed here and behaviour is
    unchanged. Confirm whether callers expect a leading batch axis on the
    returned action.
    """
    obs_tensor = ptu.from_numpy(np.asarray(obs))

    if self.discrete:
        logits = self.logits_na(obs_tensor)
        action_dist = torch.distributions.categorical.Categorical(logits=logits)
        return ptu.to_numpy(action_dist.sample())

    mean = self.mean_net(obs_tensor)
    std = torch.exp(self.logstd)
    noise = torch.randn_like(mean)
    return ptu.to_numpy(mean + noise * std)
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Sample one action per driver and return them transposed.

    An observation with at most two axes gets a leading axis prepended.
    The forward pass yields an indexable collection of distributions, one
    per driver; each is sampled with ``sample()`` in index order. The
    samples are stacked along axis 0 and the transpose is returned.

    Original author's note: this obs has a dim of
    ob_dim + ac_dim*(1+MAX_CAND_NUM).
    """
    batched = obs if len(obs.shape) > 2 else obs[None]
    action_distributions = self(ptu.from_numpy(batched))

    # sample() rather than rsample(): no gradient is needed here.
    per_driver = [
        ptu.to_numpy(action_distributions[driver].sample())
        for driver in range(self.n_drivers)
    ]
    return np.array(per_driver).T
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Sample an action for ``obs`` from the policy's action distribution.

    Args:
        obs: observation from the environment; one with at most one axis
            gets a leading axis prepended.

    Returns:
        The sampled action as a numpy array.
    """
    batched = obs[None] if len(obs.shape) <= 1 else obs
    sampled = self.forward(ptu.from_numpy(batched)).sample()
    return ptu.to_numpy(sampled)
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Sample an action for ``obs`` and return it as a numpy array.

    An observation with at most one axis gets a leading axis prepended.
    A discrete policy draws with ``sample()``; otherwise ``rsample()`` is
    used.
    """
    batched = obs if len(obs.shape) > 1 else obs[None]
    action_dist = self(ptu.from_numpy(batched))
    action = action_dist.sample() if self.discrete else action_dist.rsample()
    return ptu.to_numpy(action)
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Return the deterministic action for ``obs`` as a numpy array.

    An observation with at most one axis gets a leading axis prepended.
    Discrete policy: argmax over axis 1 of ``self.logits_na``'s output.
    Continuous policy: the output of ``self.mean_net``.
    """
    batched = obs[None] if len(obs.shape) <= 1 else obs
    obs_tensor = ptu.from_numpy(batched)

    if not self.discrete:
        return ptu.to_numpy(self.mean_net(obs_tensor))

    logits = self.logits_na(obs_tensor)
    return ptu.to_numpy(torch.argmax(logits, dim=1))
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Return the policy's action for ``obs`` as a numpy array.

    An observation with at most one axis gets a leading axis prepended.
    For a discrete policy the forward output is sampled with ``sample()``;
    otherwise the forward output itself is returned.
    """
    batched = obs if len(obs.shape) > 1 else obs[None]
    forward_out = self.forward(ptu.from_numpy(batched))
    action = forward_out.sample() if self.discrete else forward_out
    return ptu.to_numpy(action)
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Sample an action for ``obs`` from the policy's distribution.

    An observation with at most one axis gets a leading axis prepended.

    Side effect: ``self.eval()`` is called before the forward pass and the
    mode is not restored afterwards.
    """
    batched = obs[None] if len(obs.shape) <= 1 else obs
    self.eval()
    sampled = self(ptu.from_numpy(batched)).sample()
    return ptu.to_numpy(sampled)
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    """Delegate the policy update to the parent class and log its result.

    All arguments are forwarded to ``super().update``. The optional
    ``adv_n``, ``acs_labels_na`` and ``qvals`` were previously replaced by
    hard-coded ``None``, so caller-supplied values never reached the parent.

    Returns:
        dict with key 'Training Loss' holding the parent's return value
        converted with ``ptu.to_numpy``.
    """
    loss = super().update(
        observations,
        actions,
        adv_n=adv_n,
        acs_labels_na=acs_labels_na,
        qvals=qvals,
    )
    return {
        # You can add extra logging information here, but keep this line
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    """Take one optimizer step on the loss between predicted and given actions.

    ``adv_n``, ``acs_labels_na`` and ``qvals`` are accepted but not used.

    Returns:
        dict with key 'Training Loss' holding the loss.
    """
    # Clear gradients left over from the previous call; without this they
    # accumulate across updates.
    self.optimizer.zero_grad()

    # NOTE(review): get_action is annotated as returning np.ndarray. If it
    # really returns a detached array, loss.backward() cannot reach the
    # policy parameters -- confirm, and if so run the network on tensors.
    actions_pred = self.get_action(observations)
    loss = self.loss(actions_pred, actions)

    loss.backward()
    self.optimizer.step()
    return {
        # You can add extra logging information here, but keep this line
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    """Take one optimizer step fitting the policy output to ``actions``.

    ``observations`` and ``actions`` are converted to tensors; the loss is
    computed between the forward output and the converted actions.
    ``adv_n``, ``acs_labels_na`` and ``qvals`` are accepted but not used.

    Returns:
        dict with key 'Training Loss' holding the loss.
    """
    self.optimizer.zero_grad()

    predicted = self.forward(ptu.from_numpy(observations))
    expert_actions = ptu.from_numpy(actions)
    loss = self.loss(predicted, expert_actions)

    loss.backward()
    self.optimizer.step()

    training_log = {'Training Loss': ptu.to_numpy(loss)}
    return training_log
def forward(
    self,
    obs_unnormalized,
    acs_unnormalized,
    obs_mean,
    obs_std,
    acs_mean,
    acs_std,
    delta_mean,
    delta_std,
):
    """Predict the next observation and the normalized state change.

    :param obs_unnormalized: Unnormalized observations
    :param acs_unnormalized: Unnormalized actions
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
        1. `next_obs_pred`: the predicted `s_t+1`, i.e. the input
           observations plus the unnormalized network output
        2. `delta_pred_normalized`: the raw (still normalized) output of
           the delta network
    """
    # Normalize in numpy, then convert to tensors.
    obs_normalized = ptu.from_numpy(
        normalize(obs_unnormalized, obs_mean, obs_std)
    )
    acs_normalized = ptu.from_numpy(
        normalize(acs_unnormalized, acs_mean, acs_std)
    )

    network_input = torch.cat([obs_normalized, acs_normalized], dim=1)
    delta_pred_normalized = self.delta_network(network_input)

    # The unnormalized delta goes through numpy, so next_obs_pred carries
    # no gradient from the network.
    delta_pred = unnormalize(
        ptu.to_numpy(delta_pred_normalized), delta_mean, delta_std
    )
    next_obs_pred = ptu.from_numpy(obs_unnormalized) + ptu.from_numpy(delta_pred)

    return next_obs_pred, delta_pred_normalized
def update(self, observations, actions, advantages, n_rollouts=None):
    """Run one policy-gradient step.

    Args:
        observations, actions, advantages: numpy arrays, converted to tensors.
        n_rollouts: optional rollout count. When given and ``advantages``
            is 1-D (all rollouts concatenated), the summed loss is divided
            by it to give a per-rollout average.

    Returns:
        dict with key 'Training Loss' holding the loss.
    """
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    if self.discrete:
        actions = actions.to(torch.int64)
        logits = self.forward(observations)
        # Log-softmax at the taken action: logit minus logsumexp.
        log_pi = logits.gather(
            dim=-1, index=actions.unsqueeze(dim=-1)
        ).squeeze(dim=-1) - logits.logsumexp(dim=-1, keepdim=False)
    else:
        acs_mean = self.forward(observations)
        # NOTE(review): if self.normal_dist is a standard normal, this is
        # the log-density of the standardized action and omits the
        # ``- logstd`` term of the true Gaussian log-prob -- confirm.
        log_pi = self.normal_dist.log_prob(
            normalize(data=actions, mean=acs_mean, std=torch.exp(self.logstd))
        )
        # Sum per-dimension log-probs over the action axis.
        log_pi = torch.sum(log_pi, dim=-1)

    assert log_pi.shape == advantages.shape

    # The optimizer minimizes, so negate the objective.
    loss = -torch.mean(torch.sum(log_pi * advantages, dim=-1), dim=0)
    if n_rollouts is not None and advantages.dim() == 1:
        # Scale the loss itself: the previous code divided ``log_pi`` after
        # the loss was already computed, which had no effect.
        loss = loss / n_rollouts

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
def update(self, observations, actions, next_observations, data_statistics):
    """Take one gradient step on the dynamics model.

    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
         - 'obs_mean'
         - 'obs_std'
         - 'acs_mean'
         - 'acs_std'
         - 'delta_mean'
         - 'delta_std'
    :return: dict with key 'Training Loss'
    """
    stats = data_statistics

    # Target: the normalized state change.
    normalized_delta = normalize(
        data=next_observations - observations,
        mean=stats['delta_mean'],
        std=stats['delta_std'],
    )
    target = ptu.from_numpy(normalized_delta)

    # forward returns a tuple; element 1 is compared to the target.
    forward_out = self.forward(
        obs_unnormalized=observations,
        acs_unnormalized=actions,
        obs_mean=stats['obs_mean'],
        obs_std=stats['obs_std'],
        acs_mean=stats['acs_mean'],
        acs_std=stats['acs_std'],
        delta_mean=stats['delta_mean'],
        delta_std=stats['delta_std'],
    )
    loss = self.loss(forward_out[1], target)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    """Take one optimizer step fitting ``self.mean_net`` to ``actions``.

    ``observations`` and ``actions`` are numpy arrays converted to float32
    tensors. ``adv_n``, ``acs_labels_na`` and ``qvals`` are accepted but
    not used.

    Returns:
        dict with key 'Training Loss' holding the loss.
    """
    # NOTE(review): these tensors are created with torch.from_numpy and are
    # not moved to any device explicitly -- confirm that matches where
    # ``self.mean_net`` lives.
    obs_tensor = torch.from_numpy(observations).type(torch.float32)
    target_actions = torch.from_numpy(actions).type(torch.float32)

    # torch losses take (input, target): prediction first, target second.
    predicted_actions = self.mean_net(obs_tensor)
    loss = self.loss(predicted_actions, target_actions)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return {
        # You can add extra logging information here, but keep this line
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    """Run a single supervised step: zero grads, forward, loss, backward, step.

    Follows the pattern in
    https://pytorch.org/docs/stable/optim.html#taking-an-optimization-step.
    ``adv_n``, ``acs_labels_na`` and ``qvals`` are accepted but not used.

    Returns:
        dict with key 'Training Loss' holding the loss.
    """
    optimizer = self.optimizer
    optimizer.zero_grad()

    obs_tensor = ptu.from_numpy(observations)
    policy_output = self.forward(obs_tensor)
    loss = self.loss(policy_output, ptu.from_numpy(actions))

    loss.backward()
    optimizer.step()

    return {'Training Loss': ptu.to_numpy(loss)}
def get_action(self, obs: np.ndarray) -> np.ndarray:
    """Sample an action for ``obs`` and return the first batch entry.

    An observation with at most one axis gets a leading axis prepended.
    Discrete policy: the action is ``sample()`` of the forward output.
    Otherwise: ``rsample()`` is squashed with tanh, then multiplied by
    ``self.action_scale`` and shifted by ``self.action_bias``.

    Returns:
        Element 0 of the resulting numpy array.
    """
    batched = obs if len(obs.shape) > 1 else obs[None]
    action_dist = self(ptu.from_numpy(batched))

    if self.discrete:
        action = action_dist.sample()
    else:
        # tanh is applied once; the old code computed it twice and threw
        # the first result away.
        squashed = torch.tanh(action_dist.rsample())
        action = squashed * self.action_scale + self.action_bias

    return ptu.to_numpy(action)[0]
def get_action(self, obs):
    """Return the action with the largest Q-value for ``obs``.

    An observation with at most three axes gets a leading axis prepended.
    The input is converted to a tensor unless it already is one, the
    critic's Q-network is evaluated, and the argmax over the last axis is
    returned as a numpy array with all size-1 axes squeezed out.
    """
    observation = obs if len(obs.shape) > 3 else obs[None]
    if not isinstance(observation, torch.Tensor):
        observation = ptu.from_numpy(observation)

    q_values = self.critic.q_net(observation)
    greedy = q_values.argmax(dim=-1, keepdim=False)
    return ptu.to_numpy(greedy).squeeze()
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    """Take one optimizer step on the loss between sampled and given actions.

    ``observations`` and ``actions`` are converted to tensors. The policy's
    distribution is sampled with ``rsample()`` and the loss is computed
    between that sample and ``actions``. ``adv_n``, ``acs_labels_na`` and
    ``qvals`` are accepted but not used.

    Returns:
        dict with key 'Training Loss' holding the loss.
    """
    obs_tensor = ptu.from_numpy(observations)
    target_actions = ptu.from_numpy(actions)

    sampled_actions = self(obs_tensor).rsample()
    loss = self.loss(sampled_actions, target_actions)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    train_log = {'Training Loss': ptu.to_numpy(loss)}
    return train_log