def update(self, observations, actions, next_observations, data_statistics):
    # DoneTODO(Q1) normalize the obs and acs above using the normalize function and data_statistics (same as above)
    norm_obs = normalize(np.squeeze(observations), data_statistics['obs_mean'], data_statistics['obs_std'])
    norm_acs = normalize(np.squeeze(actions), data_statistics['acs_mean'], data_statistics['acs_std'])
    pred_delta = self.delta_func(
        torch.Tensor(np.concatenate((norm_obs, norm_acs), axis=1)).to(self.device))

    # DoneTODO(Q1) Define a normalized true_delta using observations, next_observations and the delta stats from data_statistics
    true_delta = torch.Tensor(
        normalize(next_observations - observations,
                  data_statistics['delta_mean'],
                  data_statistics['delta_std'])).to(self.device)

    # DoneTODO(Q1) Define a loss function that takes as input normalized versions of predicted change in state and true change in state
    loss = nn.functional.mse_loss(true_delta, pred_delta)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()
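# NOTE: every snippet in this section relies on the `normalize` / `unnormalize` helpers
# from `infrastructure.utils`. Their actual implementation is not reproduced here; the
# sketch below is an assumption (plain z-scoring with a small epsilon), not the repo's
# exact code, but it matches how the helpers are used above and below and works for
# both numpy arrays and torch tensors.
def normalize(data, mean, std, eps=1e-8):
    # scale `data` to roughly zero mean and unit standard deviation
    return (data - mean) / (std + eps)


def unnormalize(data, mean, std):
    # invert the z-score transform
    return data * std + mean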
def define_forward_pass(self):
    # normalize input data to mean 0, std 1
    obs_unnormalized = self.obs_pl
    acs_unnormalized = self.acs_pl
    # Hint: Consider using the normalize function defined in infrastructure.utils for the following two lines
    # TODO(Q1) Define obs_normalized using obs_unnormalized, self.obs_mean_pl and self.obs_std_pl
    obs_normalized = normalize(obs_unnormalized, self.obs_mean_pl, self.obs_std_pl)
    # TODO(Q2) Define acs_normalized using acs_unnormalized, self.acs_mean_pl and self.acs_std_pl
    acs_normalized = normalize(acs_unnormalized, self.acs_mean_pl, self.acs_std_pl)

    # predicted change in obs
    concatenated_input = tf.concat([obs_normalized, acs_normalized], axis=1)

    # Hint: Note that the prefix delta is used in the variable below to denote changes in state, i.e. (s'-s)
    # TODO(Q1) Use the build_mlp function and the concatenated_input above to define a neural network
    # that predicts normalized delta states (i.e. change in state)
    self.delta_pred_normalized = build_mlp(concatenated_input,
                                           self.ob_dim,
                                           self.scope,
                                           self.n_layers,
                                           self.size)
    # TODO(Q1) Unnormalize the delta_pred above using the unnormalize function,
    # and self.delta_mean_pl and self.delta_std_pl
    self.delta_pred_unnormalized = unnormalize(self.delta_pred_normalized,
                                               self.delta_mean_pl,
                                               self.delta_std_pl)
    # TODO(Q1) Predict next observation using current observation and delta prediction
    # (note that next_obs here is unnormalized)
    self.next_obs_pred = obs_unnormalized + self.delta_pred_unnormalized
def update(self, observations, actions, advantages, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # TODO: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    # is the expectation over collected trajectories of:
    # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    # by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    action_dist = self.forward(observations)
    log_pi = action_dist.log_prob(actions)
    loss = -torch.sum(log_pi * advantages)

    # TODO: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
        ## HINT: there is a `normalize` function in `infrastructure.utils`
        targets = utils.normalize(q_values, np.mean(q_values), np.std(q_values))
        targets = ptu.from_numpy(targets)

        ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        baseline_predictions = self.baseline.forward(observations).squeeze(1)
        assert baseline_predictions.shape == targets.shape, \
            f"shapes do not match, pred shape: {baseline_predictions.shape} \t target shape: {targets.shape}"

        # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
        # HINT: use `F.mse_loss`
        baseline_loss = F.mse_loss(baseline_predictions, targets)

        # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
        # HINT: remember to `zero_grad` first
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
):
    """
    :param obs_unnormalized: Unnormalized observations
    :param acs_unnormalized: Unnormalized actions
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
    This forward function should return a tuple of two items
        1. `next_obs_pred` which is the predicted `s_t+1`
        2. `delta_pred_normalized` which is the normalized (i.e. not
           unnormalized) output of the delta network. This is needed
    """
    obs_unnormalized = ptu.from_numpy(obs_unnormalized)
    acs_unnormalized = ptu.from_numpy(acs_unnormalized)
    self.update_statistics(obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std)

    # normalize input data to mean 0, std 1
    obs_normalized = normalize(obs_unnormalized, self.obs_mean, self.obs_std)
    acs_normalized = normalize(acs_unnormalized, self.acs_mean, self.acs_std)

    # predicted change in obs
    concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

    # TODO(Q1) compute delta_pred_normalized and next_obs_pred
    # Hint: as described in the PDF, the output of the network is the
    # *normalized change* in state, i.e. normalized(s_t+1 - s_t).
    delta_pred_normalized = self.delta_network(concatenated_input)
    delta_pred = unnormalize(delta_pred_normalized, self.delta_mean, self.delta_std)
    next_obs_pred = obs_unnormalized + delta_pred
    return next_obs_pred, delta_pred_normalized
def _forward_delta_pred_normalized(self, observations, actions, data_statistics):
    # TODO(Q1) Define obs_normalized using observations and the obs statistics in data_statistics
    obs_normalized = normalize(
        observations, data_statistics['obs_mean'], data_statistics['obs_std'])
    # TODO(Q2) Define acs_normalized using actions and the action statistics in data_statistics
    acs_normalized = normalize(
        actions, data_statistics['acs_mean'], data_statistics['acs_std'])
    mlp_input = torch.cat([obs_normalized, acs_normalized], dim=1)
    return self.delta_pred_normalized(mlp_input)
def define_forward_pass(self):
    # normalize input data to mean 0, std 1
    obs_unnormalized = self.obs_pl
    acs_unnormalized = self.acs_pl
    # Hint: Consider using the normalize function defined in infrastructure.utils for the following two lines
    obs_normalized = normalize(obs_unnormalized, self.obs_mean_pl, self.obs_std_pl)
    acs_normalized = normalize(acs_unnormalized, self.acs_mean_pl, self.acs_std_pl)

    # predicted change in obs
    concatenated_input = tf.concat([obs_normalized, acs_normalized], axis=1)

    # Hint: Note that the prefix delta is used in the variable below to denote changes in state, i.e. (s'-s)
    self.delta_pred_normalized = build_mlp(concatenated_input,
                                           self.ob_dim,
                                           self.scope,
                                           self.n_layers,
                                           self.size)
    self.delta_pred_unnormalized = unnormalize(self.delta_pred_normalized,
                                               self.delta_mean_pl,
                                               self.delta_std_pl)
    self.next_obs_pred = self.obs_pl + self.delta_pred_unnormalized
def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
):
    """
    :param obs_unnormalized: Unnormalized observations
    :param acs_unnormalized: Unnormalized actions
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
    This forward function should return a tuple of two items
        1. `next_obs_pred` which is the predicted `s_t+1`
        2. `delta_pred_normalized` which is the normalized (i.e. not
           unnormalized) output of the delta network. This is needed
    """
    # convert to tensors
    obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std = self.update_statistics(
        obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std)
    obs_unnormalized = ptu.from_numpy(obs_unnormalized)
    acs_unnormalized = ptu.from_numpy(acs_unnormalized)

    # normalize input data to mean 0, std 1
    obs_normalized = normalize(obs_unnormalized, obs_mean, obs_std)  # TODO(Q1)
    acs_normalized = normalize(acs_unnormalized, acs_mean, acs_std)  # TODO(Q1)

    # predicted change in obs
    concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

    # TODO(Q1) compute delta_pred_normalized and next_obs_pred
    # Hint: as described in the PDF, the output of the network is the
    # *normalized change* in state, i.e. normalized(s_t+1 - s_t).
    delta_pred_normalized = self.delta_network(concatenated_input)  # TODO(Q1)
    next_obs_pred = obs_unnormalized + unnormalize(delta_pred_normalized, delta_mean, delta_std)  # TODO(Q1)
    return next_obs_pred, delta_pred_normalized
def update(self, observations, actions, next_observations, data_statistics):
    """
    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return:
    """
    # TODO(Q1) compute the normalized target for the model.
    # Hint: you should use `data_statistics['delta_mean']` and
    # `data_statistics['delta_std']`, which keep track of the mean
    # and standard deviation of the model.
    target = normalize(next_observations - observations,
                       data_statistics['delta_mean'],
                       data_statistics['delta_std'])

    # TODO(Q1) compute the loss
    # Hint: `self(...)` returns a tuple, but you only need to use one of the
    # outputs.
    pred, pred_normalized = self(observations, actions, **data_statistics)
    loss = self.loss(pred_normalized, ptu.from_numpy(target))

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, next_observations, data_statistics):
    observations, actions, next_observations = (
        observations.to(self.device),
        actions.to(self.device),
        next_observations.to(self.device),
    )

    # normalize the labels
    delta_labels = next_observations - observations
    # TODO(Q1) Define a normalized version of delta_labels (which are unnormalized)
    # using the delta statistics in data_statistics
    delta_labels_normalized = normalize(
        delta_labels, data_statistics['delta_mean'], data_statistics['delta_std'])

    delta_pred_normalized = self._forward_delta_pred_normalized(
        observations, actions, data_statistics)

    # compare predicted deltas to labels (both should be normalized)
    # TODO(Q1) Define a loss function that takes as input normalized versions of
    # predicted change in state and ground truth change in state
    loss = self.mse_criterion(delta_labels_normalized, delta_pred_normalized)

    # train the model
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.detach().cpu()
def estimate_advantage(self, obs, q_values):
    """
    Computes advantages by (possibly) subtracting a baseline from the estimated Q values
    """
    # Estimate the advantage when nn_baseline is True,
    # by querying the neural network that you're using to learn the baseline
    if self.nn_baseline:
        baselines_unnormalized = self.actor.run_baseline_prediction(obs)
        ## ensure that the baseline and q_values have the same dimensionality
        ## to prevent silent broadcasting errors
        assert baselines_unnormalized.ndim == q_values.ndim
        ## baseline was trained with standardized q_values, so ensure that the predictions
        ## have the same mean and standard deviation as the current batch of q_values
        baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values)
        advantages = q_values - baselines
    # Else, just set the advantage to [Q]
    else:
        advantages = q_values.copy()

    # Normalize the resulting advantages to have a mean of zero
    # and a standard deviation of one
    ## HINT: there is a `normalize` function in `infrastructure.utils`
    if self.standardize_advantages:
        advantages = normalize(advantages, advantages.mean(), advantages.std())

    return advantages
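# The rescaling step in `estimate_advantage` above is the inverse of the standardization
# the baseline was trained with; it is the same arithmetic as `unnormalize` applied with
# the current batch's q-value statistics. A small illustrative sketch follows (the arrays
# are made-up example values, not data from the assignment).
import numpy as np

q_values_example = np.array([1.0, 2.0, 5.0, 10.0])   # example reward-to-go estimates
b_normalized = np.array([-0.5, 0.0, 0.3, 1.2])       # standardized baseline predictions

# rescale predictions to the mean/std of this batch of q-values, then subtract
baselines_example = b_normalized * np.std(q_values_example) + np.mean(q_values_example)
advantages_example = q_values_example - baselines_example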
def define_train_op(self):
    # normalize the labels
    self.delta_labels_normalized = normalize(self.delta_labels,
                                             self.delta_mean_pl,
                                             self.delta_std_pl)
    # compare predicted deltas to labels (both should be normalized)
    self.loss = tf.losses.mean_squared_error(self.delta_labels_normalized,
                                             self.delta_pred_normalized)
    self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
def get_prediction(self, obs, acs, data_statistics):
    if len(obs.shape) == 1 or len(acs.shape) == 1:
        obs = np.squeeze(obs)[None]
        acs = np.squeeze(acs)[None]
    norm_obs = normalize(obs, data_statistics['obs_mean'], data_statistics['obs_std'])
    norm_acs = normalize(acs, data_statistics['acs_mean'], data_statistics['acs_std'])
    norm_input = torch.Tensor(np.concatenate((norm_obs, norm_acs), axis=1)).to(self.device)
    norm_delta = self.delta_func(norm_input).cpu().detach().numpy()
    delta = unnormalize(norm_delta, data_statistics['delta_mean'], data_statistics['delta_std'])
    return obs + delta
def define_train_op(self):
    # normalize the labels
    # TODO(Q1) Define a normalized version of delta_labels using self.delta_labels
    # (which are unnormalized), and self.delta_mean_pl and self.delta_std_pl
    self.delta_labels_normalized = normalize(self.delta_labels,
                                             self.delta_mean_pl,
                                             self.delta_std_pl)
    # compare predicted deltas to labels (both should be normalized)
    # TODO(Q1) Define a loss function that takes as input normalized versions of
    # predicted change in state and ground truth change in state
    self.loss = tf.losses.mean_squared_error(self.delta_labels_normalized,
                                             self.delta_pred_normalized)
    # TODO(Q1) Define a train_op to minimize the loss defined above. Adam optimizer will work well.
    self.train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
def update(self, observations, actions, advantages, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # TODO: compute the loss that should be optimized when training with policy gradient √
    # HINT1: Recall that the expression that we want to MAXIMIZE
    # is the expectation over collected trajectories of:
    # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    # by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    distribution = self.forward(observations)
    log_distribution: torch.Tensor = distribution.log_prob(actions)
    if not self.discrete:
        # sum over action dimensions to fix the dimension mismatch
        log_distribution = log_distribution.sum(1)
    assert log_distribution.size() == advantages.size()
    loss = -(log_distribution * advantages).sum()

    # TODO: optimize `loss` using `self.optimizer` √
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline and q_values is not None:
        ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one √
        ## HINT: there is a `normalize` function in `infrastructure.utils`
        targets = utils.normalize(q_values, q_values.mean(), q_values.std())
        targets = ptu.from_numpy(targets)

        ## TODO: use the `forward` method of `self.baseline` to get baseline predictions √
        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        baseline_predictions: torch.Tensor = self.baseline(observations).squeeze()
        assert baseline_predictions.shape == targets.shape

        # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
        # HINT: use `F.mse_loss`
        baseline_loss = F.mse_loss(baseline_predictions, targets)

        # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
        # HINT: remember to `zero_grad` first
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    # only log the baseline loss when it was actually computed above
    if self.nn_baseline and q_values is not None:
        train_log['Baseline Loss'] = ptu.to_numpy(baseline_loss)
    return train_log
def update(self, observations, actions, advantages, q_values=None):
    # Not strictly necessary to convert to tensor type
    observations = tf.constant(observations, dtype=tf.float32)
    actions = tf.constant(actions, dtype=tf.float32)
    advantages = tf.constant(advantages, dtype=tf.float32)

    # TODO: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    # is the expectation over collected trajectories of:
    # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    # by the `forward` method
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(self.policy_params)
        pi = self.forward(observations)
        logp = pi.log_prob(actions)
        loss = -tf.reduce_mean(logp * advantages)
    gradients = tape.gradient(loss, self.policy_params)
    self.optimizer.apply_gradients(zip(gradients, self.policy_params))

    if self.nn_baseline:
        with tf.GradientTape() as tape:
            ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            targets = normalize(q_values, np.mean(q_values), np.std(q_values))

            ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = self.baseline(observations)
            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            baseline_predictions = tf.squeeze(baseline_predictions)  # remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape

            # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            baseline_loss = 0.5 * tf.keras.losses.mean_squared_error(
                baseline_predictions, targets)

        # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
        gradients = tape.gradient(baseline_loss, self.baseline.trainable_variables)
        self.baseline_optimizer.apply_gradients(
            zip(gradients, self.baseline.trainable_variables))

    train_log = {
        'Training Loss': -loss.numpy(),
    }
    return train_log
def update(self, observations, actions, next_observations, data_statistics):
    """
    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return:
    """
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    next_observations = ptu.from_numpy(next_observations)

    # Hint: you should use `data_statistics['delta_mean']` and
    # `data_statistics['delta_std']`, which keep track of the mean
    # and standard deviation of the model.

    # Updating this model's statistics appears unnecessary: the statistics are already
    # updated in mb_agent and the MPC policy, and the training curves with and without
    # this call were identical in experiments. Kept here for safety.
    self.update_statistics(*list(data_statistics.values()))

    data_statistics = {
        k: ptu.from_numpy(v) for k, v in data_statistics.items()
    }

    next_obs_pred, delta_pred_normalized = self.forward(
        observations, actions,
        data_statistics['obs_mean'], data_statistics['obs_std'],
        data_statistics['acs_mean'], data_statistics['acs_std'],
        data_statistics['delta_mean'], data_statistics['delta_std'])

    # TODO(Q1) done compute the normalized target for the model.
    target = normalize(next_observations - observations,
                       data_statistics['delta_mean'],
                       data_statistics['delta_std'])
    loss = self.loss(target, delta_pred_normalized)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
def forward(  # input and output are both tensors
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
):
    """
    :param obs_unnormalized: Unnormalized observations
    :param acs_unnormalized: Unnormalized actions
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
    This forward function should return a tuple of two items
        1. `next_obs_pred` which is the predicted `s_t+1`
        2. `delta_pred_normalized` which is the normalized (i.e. not
           unnormalized) output of the delta network. This is needed
    """
    obs_normalized = normalize(obs_unnormalized, obs_mean, obs_std)
    acs_normalized = normalize(acs_unnormalized, acs_mean, acs_std)

    # predicted change in obs
    concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

    # TODO(Q1) done compute delta_pred_normalized and next_obs_pred
    # Hint: as described in the PDF, the output of the network is the
    # *normalized change* in state, i.e. normalized(s_t+1 - s_t).
    delta_pred_normalized = self.delta_network(concatenated_input)
    next_obs_pred = unnormalize(delta_pred_normalized, delta_mean, delta_std) + obs_unnormalized
    return next_obs_pred, delta_pred_normalized
def update(self, observations, actions, next_observations, data_statistics):
    """
    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return:
    """
    obs = ptu.from_numpy(observations)
    acs = ptu.from_numpy(actions)
    next_obs = ptu.from_numpy(next_observations)
    obs_mean = ptu.from_numpy(data_statistics["obs_mean"])
    obs_std = ptu.from_numpy(data_statistics["obs_std"])
    acs_mean = ptu.from_numpy(data_statistics["acs_mean"])
    acs_std = ptu.from_numpy(data_statistics["acs_std"])
    delta_mean = ptu.from_numpy(data_statistics["delta_mean"])
    delta_std = ptu.from_numpy(data_statistics["delta_std"])

    # compute the normalized target for the model.
    # Hint: you should use `data_statistics['delta_mean']` and
    # `data_statistics['delta_std']`, which keep track of the mean
    # and standard deviation of the model.
    delta_target_unnormalized = next_obs - obs
    delta_target_normalized = normalize(delta_target_unnormalized, delta_mean, delta_std)

    # compute the loss
    _, delta_pred_normalized = self(
        obs,
        acs,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    )
    loss = self.loss(delta_target_normalized, delta_pred_normalized)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        "Training Loss": ptu.to_numpy(loss),
    }
def get_prediction(self, obs, acs, data_statistics):
    if len(obs.shape) == 1 or len(acs.shape) == 1:
        obs = np.squeeze(obs)[None]
        acs = np.squeeze(acs)[None]

    # DoneTODO(Q1) normalize the obs and acs above using the normalize function and data_statistics
    norm_obs = normalize(obs, data_statistics['obs_mean'], data_statistics['obs_std'])
    norm_acs = normalize(acs, data_statistics['acs_mean'], data_statistics['acs_std'])
    norm_input = torch.Tensor(np.concatenate((norm_obs, norm_acs), axis=1)).to(self.device)
    norm_delta = self.delta_func(norm_input).cpu().detach().numpy()

    # DoneTODO(Q1) Unnormalize the norm_delta above using the unnormalize function and data_statistics
    delta = unnormalize(norm_delta, data_statistics['delta_mean'], data_statistics['delta_std'])

    # DoneTODO(Q1) Return the predicted next observation (you will use obs and delta)
    return obs + delta
def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
):
    """
    :param obs_unnormalized: Unnormalized observations
    :param acs_unnormalized: Unnormalized actions
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
    This forward function should return a tuple of two items
        1. `next_obs_pred` which is the predicted `s_t+1`
        2. `delta_pred_normalized` which is the normalized (i.e. not
           unnormalized) output of the delta network. This is needed
    """
    # normalize input data to mean 0, std 1
    obs_normalized = normalize(obs_unnormalized, obs_mean, obs_std)
    acs_normalized = normalize(acs_unnormalized, acs_mean, acs_std)

    # predicted change in obs
    concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)
    delta_pred_normalized = self.delta_network(concatenated_input)
    next_obs_pred = unnormalize(delta_pred_normalized, delta_mean, delta_std) + obs_unnormalized
    return next_obs_pred, delta_pred_normalized
def update(self, observations, actions, advantages, n_rollouts=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # TODO: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    # is the expectation over collected trajectories of:
    # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    # by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    if self.discrete:
        actions = actions.to(torch.int64)
        # logits: (batch_size, seq_len, action_dim)
        logits = self.forward(observations)
        # log_pi: (batch_size, seq_len)
        log_pi = logits.gather(dim=-1, index=actions.unsqueeze(dim=-1)).squeeze(dim=-1) \
            - logits.logsumexp(dim=-1, keepdim=False)
    else:
        acs_mean = self.forward(observations)
        # log_pi: (batch_size, seq_len, action_dim)
        log_pi = self.normal_dist.log_prob(
            normalize(data=actions, mean=acs_mean, std=torch.exp(self.logstd)))
        # log_pi: (batch_size, seq_len)
        log_pi = torch.sum(log_pi, dim=-1)

    assert log_pi.shape == advantages.shape

    if n_rollouts is not None and advantages.dim() == 1:
        # all rollouts are concatenated; manually divide by n_rollouts to get the average
        # (this must happen before the loss is computed, otherwise it has no effect)
        log_pi = log_pi / n_rollouts

    loss = -torch.mean(torch.sum(log_pi * advantages, dim=-1), dim=0)

    # TODO: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
def update(self, observations, actions, next_observations, data_statistics):
    """
    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return:
    """
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    next_observations = ptu.from_numpy(next_observations)
    obs_mean = ptu.from_numpy(data_statistics['obs_mean'])
    obs_std = ptu.from_numpy(data_statistics['obs_std'])
    acs_mean = ptu.from_numpy(data_statistics['acs_mean'])
    acs_std = ptu.from_numpy(data_statistics['acs_std'])
    delta_mean = ptu.from_numpy(data_statistics['delta_mean'])
    delta_std = ptu.from_numpy(data_statistics['delta_std'])

    # Hint: you should use `data_statistics['delta_mean']` and
    # `data_statistics['delta_std']`, which keep track of the mean
    # and standard deviation of the model.
    target = normalize(next_observations - observations, delta_mean, delta_std)

    # Hint: `self(...)` returns a tuple, but you only need to use one of the
    # outputs.
    prediction_delta = self.forward(observations, actions, obs_mean, obs_std,
                                    acs_mean, acs_std, delta_mean, delta_std)[1]
    loss = self.loss(prediction_delta, target)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
def update(self, observations, actions, advantages, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # Maximize expectation over collected trajectories of:
    # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    loss = -torch.sum(self.forward(observations).log_prob(actions) * advantages)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        ## normalize the q_values to have a mean of zero and a standard deviation of one
        targets = utils.normalize(q_values, np.mean(q_values), np.std(q_values))
        targets = ptu.from_numpy(targets)

        ## use the `forward` method of `self.baseline` to get baseline predictions
        baseline_predictions = self.baseline.forward(observations)
        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        baseline_predictions = torch.squeeze(baseline_predictions)
        assert baseline_predictions.shape == targets.shape

        # compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
        baseline_loss = F.mse_loss(baseline_predictions, targets)

        # optimize the baseline with its own optimizer (the policy optimizer
        # does not hold the baseline's parameters)
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
def update(self, observations, actions, **kwargs):
    pi = self.forward(observations)
    advantages = kwargs['advantages'] if 'advantages' in kwargs else 1.0

    if self.discrete:
        log_prob = pi.log_prob(actions)
    else:
        log_prob = pi.log_prob(actions).sum(axis=-1)

    loss = torch.sum(-log_prob * advantages)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    train_log = {
        'Training pi Loss': ptu.to_numpy(loss),
    }

    if self.nn_baseline:
        q_values = kwargs['q_values']
        targets = normalize(q_values, q_values.mean(), q_values.std())
        targets = ptu.from_numpy(targets)

        baseline_predictions = self.baseline(observations)
        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        baseline_predictions = baseline_predictions.squeeze()
        assert baseline_predictions.shape == targets.shape

        baseline_loss = F.mse_loss(baseline_predictions, targets)

        # HINT: remember to `zero_grad` first
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

        train_log['Training v Loss'] = ptu.to_numpy(baseline_loss)

    return train_log
def update(self, observations, actions, next_observations, data_statistics):
    """
    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
        - 'obs_mean'
        - 'obs_std'
        - 'acs_mean'
        - 'acs_std'
        - 'delta_mean'
        - 'delta_std'
    :return:
    """
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    next_observations = ptu.from_numpy(next_observations)
    data_statistics = {
        k: ptu.from_numpy(v) for k, v in data_statistics.items()
    }

    target = normalize(next_observations - observations,
                       data_statistics['delta_mean'],
                       data_statistics['delta_std'])
    _, pred_delta = self.forward(observations, actions, **data_statistics)
    loss = self.loss(pred_delta, target)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
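# For reference, the `data_statistics` dictionary consumed by the `update` methods above
# is computed from the transitions collected so far. The helper below is an illustrative
# sketch only (the function name is hypothetical, not the repo's actual code); it assumes
# nothing beyond the six keys listed in the docstrings above.
import numpy as np

def compute_data_statistics(observations, actions, next_observations):
    # per-dimension mean/std used to normalize the dynamics model's inputs and targets
    deltas = next_observations - observations
    return {
        'obs_mean': observations.mean(axis=0),
        'obs_std': observations.std(axis=0),
        'acs_mean': actions.mean(axis=0),
        'acs_std': actions.std(axis=0),
        'delta_mean': deltas.mean(axis=0),
        'delta_std': deltas.std(axis=0),
    }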
def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
    log = {}

    if self.t > self.num_exploration_steps:
        # TODO: After exploration is over, set the actor to optimize the extrinsic critic
        # HINT: Look at method ArgMaxPolicy.set_critic
        self.actor.set_critic(self.exploitation_critic)

    if (self.t > self.learning_starts
            and self.t % self.learning_freq == 0
            and self.replay_buffer.can_sample(self.batch_size)):

        # Get Reward Weights #
        # TODO: Get the current explore reward weight and exploit reward weight
        # using the schedules passed in (see __init__)
        # COMMENT: Until part 3, explore_weight = 1, and exploit_weight = 0
        explore_weight = self.explore_weight_schedule.value(self.t)
        exploit_weight = self.exploit_weight_schedule.value(self.t)

        # Run Exploration Model #
        # TODO: Evaluate the exploration model on s' to get the exploration bonus
        # HINT: Normalize the exploration bonus, as RND values vary highly in magnitude
        prediction_error = self.exploration_model.forward_np(next_ob_no)
        expl_bonus = utils.normalize(prediction_error, prediction_error.mean(), prediction_error.std())

        # Reward Calculations #
        # TODO: Calculate mixed rewards, which will be passed into the exploration critic
        # HINT: See doc for definition of mixed_reward
        mixed_reward = explore_weight * expl_bonus + exploit_weight * re_n

        # TODO: Calculate the environment reward
        # HINT: For part 1, env_reward is just 're_n'
        # After this, env_reward is 're_n' shifted by self.exploit_rew_shift,
        # and scaled by self.exploit_rew_scale
        env_reward = (re_n + self.exploit_rew_shift) * self.exploit_rew_scale

        # Update Critics And Exploration Model #
        # TODO 1): Update the exploration model (based off s')
        # TODO 2): Update the exploration critic (based off mixed_reward)
        # TODO 3): Update the exploitation critic (based off env_reward)
        expl_model_loss = self.exploration_model.update(next_ob_no)
        exploration_critic_loss = self.exploration_critic.update(
            ob_no, ac_na, next_ob_no, mixed_reward, terminal_n)
        exploitation_critic_loss = self.exploitation_critic.update(
            ob_no, ac_na, next_ob_no, env_reward, terminal_n)

        # Target Networks #
        if self.num_param_updates % self.target_update_freq == 0:
            # TODO: Update the exploitation and exploration target networks
            self.exploration_critic.update_target_network()
            self.exploitation_critic.update_target_network()

        # Logging #
        log['Exploration Critic Loss'] = exploration_critic_loss['Training Loss']
        log['Exploitation Critic Loss'] = exploitation_critic_loss['Training Loss']
        log['Exploration Model Loss'] = expl_model_loss

        # TODO: Uncomment these lines after completing cql_critic.py
        log['Exploitation Data q-values'] = exploitation_critic_loss['Data q-values']
        log['Exploitation OOD q-values'] = exploitation_critic_loss['OOD q-values']
        log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss']

        self.num_param_updates += 1

    self.t += 1
    return log
def update(self, observations, actions, advantages, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # TODO done: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    # is the expectation over collected trajectories of:
    # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    # by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    action_dist = self.forward(observations)
    if self.discrete:
        log_pi = action_dist.log_prob(actions)
    else:
        # distributions.Independent:
        # Reinterprets some of the batch dims of a distribution as event dims.
        # This is mainly useful for changing the shape of the result of log_prob.
        # From debugging: for LunarLander, action_dist.batch_shape = [5004] -> use it directly;
        # for InvertedPendulum, action_dist.batch_shape = torch.Size([40070, 2]) ->
        # wrap in Independent so the resulting batch_shape becomes 40070.
        if len(action_dist.batch_shape) == 1:
            log_pi = action_dist.log_prob(actions)
        else:
            action_dist_new = distributions.Independent(action_dist, 1)
            log_pi = action_dist_new.log_prob(actions)

    # log_pi and advantages must have matching shapes before the elementwise product;
    # a mismatch here previously raised "The size of tensor a (2) must match the size
    # of tensor b (40006) at non-singleton dimension 1".
    assert advantages.ndim == log_pi.ndim
    sums = advantages * log_pi
    # `optimizer.step()` MINIMIZES a loss, but we want to MAXIMIZE the expectation
    loss = -torch.sum(sums)

    # TODO done: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
        ## HINT: there is a `normalize` function in `infrastructure.utils`
        targets = utils.normalize(q_values, np.mean(q_values), np.std(q_values))
        targets = ptu.from_numpy(targets)

        ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
        baseline_predictions = self.baseline.forward(observations)
        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        # TODO? move squeeze into model.forward
        baseline_predictions = baseline_predictions.squeeze()
        assert baseline_predictions.shape == targets.shape, \
            "{} vs {}".format(baseline_predictions.shape, targets.shape)

        # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
        # HINT: use `F.mse_loss`
        baseline_loss = F.mse_loss(baseline_predictions, targets)

        # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
        # HINT: remember to `zero_grad` first
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
def update(self, observations, actions, advantages=None, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)  # advantages = (Q_t - b_t)

    # TODO: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE is the expectation over
    # collected trajectories of: sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # We take a step along the gradient of the cumulative reward J (the steepest-ascent
    # direction) rather than maximizing J directly, because maximizing J directly is hard.
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    # by the `self.forward` method above
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

    # compute log pi(a_t|s_t)
    log_pi = self.forward(observations).log_prob(actions)

    # Use backpropagation to compute the policy gradient: the pseudo-loss is defined so
    # that its gradient equals the policy gradient. It is a weighted maximum likelihood,
    # where the weights are the advantages (reward-to-go minus baseline, Q = q_value - baseline).
    # The minus sign turns gradient descent into gradient ascent.
    # Using mean instead of sum over the batch only rescales the gradient; the optimizer adapts.
    # compute pseudo-loss sum_{t=0}^{T-1} [log pi(a_t|s_t) * (q_t - b_t)]
    loss = torch.neg(torch.mean(torch.mul(log_pi, advantages)))

    # TODO: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        # The most common choice of baseline is the on-policy value function V^pi(s_t),
        # i.e. the average return the agent gets starting from s_t (the reward-to-go, q_value).
        # TODO: normalize the q_values to have a mean of zero and a standard deviation of one
        # (the baseline network is trained against standardized targets; its predictions are
        # rescaled back to the q-value statistics when advantages are estimated)
        ## HINT: there is a `normalize` function in `infrastructure.utils`
        targets = utils.normalize(q_values, np.mean(q_values), np.std(q_values))
        targets = ptu.from_numpy(targets)

        # TODO: use the `forward` method of `self.baseline` to get baseline predictions
        # self.baseline is approximated by a neural network, updated concurrently with the policy
        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        baseline_predictions = self.baseline.forward(observations).squeeze()
        assert baseline_predictions.shape == targets.shape

        # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
        # HINT: use `F.mse_loss`
        # The simplest method for learning the baseline is to minimize MSE.
        baseline_loss = self.baseline_loss(baseline_predictions, targets)

        # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
        # HINT: remember to `zero_grad` first
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
def update(self, observations, actions, adv_n=None, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(adv_n)

    # TODO_: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    # is the expectation over collected trajectories of:
    # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    # by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
    if self.discrete:
        log_prob = self.forward(observations).log_prob(actions)
    else:
        log_prob = utils.multivariate_normal_diag(
            loc=self.forward(observations),
            scale_diag=torch.exp(self.logstd)).log_prob(actions)

    if self.nn_baseline:
        # advantage = q - baseline, so minimizing log_prob * (baseline - q)
        # maximizes log_prob * (q - baseline)
        loss = torch.mean(log_prob * (torch.squeeze(self.baseline(observations)) - ptu.from_numpy(q_values)))
    else:
        loss = -1.0 * torch.mean(log_prob * advantages)

    # TODO_: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        ## TODO_: normalize the q_values to have a mean of zero and a standard deviation of one
        ## HINT: there is a `normalize` function in `infrastructure.utils`
        targets = utils.normalize(q_values, np.mean(q_values), np.std(q_values))
        targets = ptu.from_numpy(targets)

        ## TODO_: use the `forward` method of `self.baseline` to get baseline predictions
        ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
        ## [ N ] versus shape [ N x 1 ]
        ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
        baseline_predictions = torch.squeeze(self.baseline(observations))
        assert baseline_predictions.shape == targets.shape

        # TODO_: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
        # HINT: use `F.mse_loss`
        baseline_loss = self.baseline_loss(targets, baseline_predictions)

        # TODO_: optimize `baseline_loss` using `self.baseline_optimizer`
        # HINT: remember to `zero_grad` first
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    return loss.item()
def define_train_op(self):
    # normalize the labels
    # TODO(Q1) Define a normalized version of delta_labels using self.delta_labels
    # (which are unnormalized), and self.delta_mean_pl and self.delta_std_pl  # DONE
    self.delta_labels_normalized = normalize(self.delta_labels,
                                             self.delta_mean_pl,
                                             self.delta_std_pl)