def __init__(self, env, num_init_random_rollouts=10, max_rollout_length=500,
             num_onplicy_iters=10, num_onpolicy_rollouts=10, training_epochs=60,
             training_batch_size=512, render=False, mpc_horizon=15,
             num_random_action_selection=4096, nn_layers=1):
    """
    Set up a model-based RL trainer: store hyperparameters, gather an
    initial dataset with a random policy, and build the MPC policy whose
    dynamics model will be trained on that dataset.

    Args:
        env: the environment to train in (passed to RandomPolicy and
            ModelBasedPolicy).
        num_init_random_rollouts (int): number of random-policy rollouts
            gathered up front into self._random_dataset.
        max_rollout_length (int): cap on steps per rollout.
        num_onplicy_iters (int): number of on-policy training iterations.
            NOTE(review): parameter name is a typo for "num_onpolicy_iters";
            kept as-is because renaming would break keyword-argument callers.
        num_onpolicy_rollouts (int): rollouts gathered per on-policy iteration.
        training_epochs (int): epochs per dynamics-model training call.
        training_batch_size (int): minibatch size for model training.
        render (bool): whether to render rollouts.
        mpc_horizon (int): planning horizon for the MPC policy.
        num_random_action_selection (int): number of candidate action
            sequences sampled during MPC action selection.
        nn_layers (int): hidden-layer count for the dynamics network.
    """
    self._env = env
    self._max_rollout_length = max_rollout_length
    # Stored under the correctly spelled attribute despite the typo'd
    # parameter name above.
    self._num_onpolicy_iters = num_onplicy_iters
    self._num_onpolicy_rollouts = num_onpolicy_rollouts
    self._training_epochs = training_epochs
    self._training_batch_size = training_batch_size
    self._render = render

    # Seed the dataset with random-policy experience before any training.
    logger.info('Gathering random dataset')
    self._random_dataset = self._gather_rollouts(utils.RandomPolicy(env),
                                                 num_init_random_rollouts)

    logger.info('Creating policy')
    self._policy = ModelBasedPolicy(
        env,
        self._random_dataset,
        horizon=mpc_horizon,
        num_random_action_selection=num_random_action_selection,
        nn_layers=nn_layers)

    # Start the wall-clock timer covering the whole experiment.
    timeit.reset()
    timeit.start('total')
def run_q1(self):
    """
    Train the dynamics model on the random dataset, then evaluate the
    model's open-loop prediction quality.

    For each rollout in self._random_dataset:
      (i)  starting from the rollout's first state only (states[0] — no
           other ground-truth state is used), roll the model forward
           through the recorded action sequence, collecting predicted
           states;
      (ii) plot predicted (red) vs. actual (black) trajectories per state
           dimension and save one figure per rollout to the experiment
           folder (via logger.dir).
    """
    logger.info('Training policy....')
    self._train_policy(self._random_dataset)

    logger.info('Evaluating predictions...')
    for r_num, (states, actions, _, _, _) in enumerate(
            self._random_dataset.rollout_iterator()):
        # Open-loop rollout: seed with the true initial state, then feed
        # each model prediction back in as the next input state.
        cur_state = np.asarray(states[0])
        pred_states = [cur_state]
        # BUGFIX: index actions[t] directly. The previous version also
        # pre-fetched actions[i + 1] on the final iteration, which was
        # never used and raised IndexError whenever
        # len(actions) == len(states) - 1.
        for t in range(len(states) - 1):
            cur_state = self._policy.predict(cur_state,
                                             np.asarray(actions[t]))
            pred_states.append(cur_state)

        states = np.asarray(states)
        pred_states = np.asarray(pred_states)

        # Provided plotting code: one axis per state dimension, ground
        # truth in black and model prediction in red.
        state_dim = states.shape[1]
        rows = int(np.sqrt(state_dim))
        cols = state_dim // rows
        f, axes = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
        f.suptitle(
            'Model predictions (red) versus ground truth (black) for open-loop predictions'
        )
        for i, (ax, state_i, pred_state_i) in enumerate(
                zip(axes.ravel(), states.T, pred_states.T)):
            ax.set_title('state {0}'.format(i))
            ax.plot(state_i, color='k')
            ax.plot(pred_state_i, color='r')
        plt.tight_layout()
        plt.subplots_adjust(top=0.90)
        f.savefig(os.path.join(logger.dir,
                               'prediction_{0:03d}.png'.format(r_num)),
                  bbox_inches='tight')
    logger.info('All plots saved to folder')
def run_bonus_q3(self):
    """
    On-policy iteration: starting from the random dataset, repeatedly
    (a) train the dynamics model on all data collected so far,
    (b) gather fresh rollouts with the MPC policy using cross-entropy
        action selection, and
    (c) fold those rollouts into the aggregate dataset,
    logging statistics for each iteration's new rollouts.
    """
    aggregate = self._random_dataset

    # Iteration -1 reports the purely-random baseline before any training.
    itr = -1
    logger.info('Iteration {0}'.format(itr))
    logger.record_tabular('Itr', itr)
    self._log(aggregate)

    for itr in range(self._num_onpolicy_iters + 1):
        logger.info('Iteration {0}'.format(itr))
        logger.record_tabular('Itr', itr)

        # (a) fit the model to everything gathered so far
        logger.info('Training policy...')
        self._train_policy(aggregate)

        # (b) collect on-policy experience with CEM action selection
        logger.info('Gathering rollouts...')
        fresh_rollouts = self._gather_rollouts_cross_entropy(
            self._policy, self._num_onpolicy_rollouts)

        # (c) grow the dataset and report stats for the new rollouts only
        logger.info('Appending dataset...')
        aggregate.append(fresh_rollouts)
        self._log(fresh_rollouts)
def run_bonus_q2(self):
    """
    Single-shot model-based RL: log the random-policy baseline, fit the
    dynamics model once on the random dataset, then evaluate the resulting
    MPC policy (cross-entropy action selection) and log its performance.
    """
    # Baseline: statistics of the untrained, random-policy dataset.
    logger.info('Random policy')
    self._log(self._random_dataset)

    # Fit the dynamics model on the random experience.
    logger.info('Training policy....')
    self._train_policy(self._random_dataset)

    # Roll out the trained MPC policy and report how well it does.
    logger.info('Evaluating policy...')
    evaluation_rollouts = self._gather_rollouts_cross_entropy(
        self._policy, self._num_onpolicy_rollouts)
    logger.info('Trained policy')
    self._log(evaluation_rollouts)
def run_test(self):
    """
    Smoke test: train on the random dataset and plot open-loop prediction
    quality — exactly what run_q1 does.

    The previous implementation was a ~40-line verbatim copy of run_q1
    (differing only in unpacking the unused `rewards` slot); delegating
    removes the duplication so the two can never drift out of sync.
    """
    return self.run_q1()