def get_action(self, step_number, curr_state_K, actions_taken_so_far,
               starting_fullenvstate, evaluating, take_exploratory_actions):
    """Select optimal action

    Args:
        curr_state_K:
            current "state" as known by the dynamics model:
            a concatenation of the current obs and the (K-1) past obs
        step_number:
            which step number the rollout is currently on
            (used to calculate costs)
        actions_taken_so_far:
            used to restore the env to the correct state,
            when using ground-truth dynamics
        starting_fullenvstate:
            full state of env before this rollout, used for env resets
            (when using ground-truth dynamics)
        evaluating:
            if True: default to not having any noise on the executed action
        take_exploratory_actions:
            if True: select action based on disagreement of ensembles
            if False: (default) select action based on predicted costs

    Returns:
        best_action: optimal action to perform, according to this controller
        resulting_states_list: predicted results of executing the candidate
            action sequences
    """

    ############################
    ### sample N random candidate action sequences, each of length horizon
    ############################

    np.random.seed()  # to get different action samples for each rollout

    all_samples = []
    junk = 1  # dummy observation; the random policy ignores this argument
    for i in range(self.N):
        sample = []
        for num in range(self.horizon):
            sample.append(
                self.rand_policy.get_action(
                    junk,
                    prev_action=None,
                    random_sampling_params=self.random_sampling_params,
                    hold_action_overrideToOne=True)[0])
        all_samples.append(np.array(sample))
    all_samples = np.array(all_samples)

    ########################################################################
    ### make each action element be (past K actions) instead of just (curr action)
    ########################################################################

    #all_samples : [N, horizon, ac_dim]
    all_acs = turn_acs_into_acsK(actions_taken_so_far, all_samples, self.K,
                                 self.N, self.horizon)
    #all_acs : [N, horizon, K, ac_dim]

    ############################
    ### have model predict the result of executing those candidate action sequences
    ############################

    if self.use_ground_truth_dynamics:
        paths = trajectory_sampler.sample_paths_parallel(
            self.N,
            all_samples,
            actions_taken_so_far,
            starting_fullenvstate,
            self.env,
            suppress_print=True,
        )  #list of dicts, each w observations/actions/etc.

        #the actual number of returned paths is num_cpu*floor(self.N/num_cpu)
        #rather than self.N, so update the parameter accordingly
        self.N = len(paths)
        all_samples = all_samples[:self.N]

        resulting_states = [entry['observations'] for entry in paths]
        resulting_states = np.swapaxes(resulting_states, 0, 1)
        resulting_states_list = [resulting_states]
    else:
        resulting_states_list = self.dyn_models.do_forward_sim(
            [curr_state_K, 0], np.copy(all_acs))
        resulting_states_list = np.swapaxes(
            resulting_states_list, 0, 1)  #[ensSize, horizon+1, N, statesize]

    ############################
    ### evaluate the predicted trajectories
    ############################

    #calculate costs : [N,]
    costs, mean_costs, std_costs = calculate_costs(
        resulting_states_list, all_samples, self.reward_func, evaluating,
        take_exploratory_actions)

    #pick best action sequence
    best_score = np.min(costs)
    best_sim_number = np.argmin(costs)
    best_sequence = all_samples[best_sim_number]
    best_action = np.copy(best_sequence[0])

    #########################################
    ### execute the candidate action sequences on the real dynamics
    ### instead of just on the model
    ### useful for debugging/analysis...
    #########################################
    if self.execute_sideRollouts:
        if (step_number % self.horizon) == 0:
            cmap = plt.get_cmap('jet_r')
            num_colors = 10
            indices_to_vis = [0, 1, 2]
            curr_plot = 1
            num_plots = len(indices_to_vis)
            for index_state_to_vis in indices_to_vis:
                plt.subplot(num_plots, 1, curr_plot)
                for sim_num in range(num_colors):
                    true_states = do_groundtruth_rollout(
                        all_samples[sim_num], self.env,
                        starting_fullenvstate, actions_taken_so_far)
                    color = cmap(float(sim_num) / num_colors)

                    if self.iter_num == 0 and self.plot_sideRollouts:
                        if (step_number % 10) == 0:
                            #dashed: model prediction; solid: ground truth
                            plt.plot(
                                resulting_states_list[-1]
                                [:, sim_num, index_state_to_vis],
                                '--',
                                c=color,
                                label=sim_num)
                            plt.plot(
                                np.array(true_states)[:, index_state_to_vis],
                                '-',
                                c=color)
                curr_plot += 1
            plt.legend()
            plt.show()

    return best_action, resulting_states_list
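########################################################################
### Reference sketch: `turn_acs_into_acsK` is defined elsewhere in the
### repo; the minimal reimplementation below only illustrates what the
### shape comments above imply ([N, horizon, ac_dim] in,
### [N, horizon, K, ac_dim] out). The zero-padding and the
### oldest-to-newest ordering of each K-window are assumptions, not
### necessarily the repo's actual conventions.
########################################################################
def turn_acs_into_acsK_sketch(actions_taken_so_far, all_samples, K, N,
                              horizon):
    import numpy as np

    ac_dim = all_samples.shape[-1]

    #(K-1) most recent real actions, zero-padded if the rollout just started
    prefix = np.zeros((K - 1, ac_dim))
    taken = np.asarray(actions_taken_so_far).reshape(-1, ac_dim)
    if K > 1 and taken.shape[0] > 0:
        recent = taken[-(K - 1):]
        prefix[-recent.shape[0]:] = recent

    #for each candidate, slide a length-K window along (past + sampled) actions
    all_acs = np.zeros((N, horizon, K, ac_dim))
    for n in range(N):
        padded = np.concatenate(
            [prefix, all_samples[n]], axis=0)  #[K-1+horizon, ac_dim]
        for t in range(horizon):
            all_acs[n, t] = padded[t:t + K]  #window's last row = current action
    return all_acs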
def get_action(self, step_number, curr_state_K, actions_taken_so_far,
               starting_fullenvstate, evaluating, take_exploratory_actions):
    """MPPI variant of get_action: sample noise around a running action mean,
    smooth it temporally, score the candidates under the dynamics model, and
    update the mean with a reward-weighted (MPPI) update.
    Arguments and returns mirror the random-shooting get_action above."""

    # init vars
    curr_state_K = np.array(curr_state_K)  #[K, sa_dim]

    ## Debugging: keep a sliding window of the last (horizon+1) observations
    self.observation_history.append(curr_state_K[0])
    if len(self.observation_history) > self.horizon + 1:
        self.observation_history.pop(0)

    # remove the 1st entry of mean (mean from past timestep, that was just executed)
    # and copy the last entry (starting point for the next timestep)
    past_action = self.mppi_mean[0].copy()
    self.mppi_mean[:-1] = self.mppi_mean[1:]

    ##############################################
    ## sample candidate action sequences
    ## by creating smooth filtered trajecs (noised around a mean)
    ##############################################

    np.random.seed()  # to get different action samples for each rollout

    #sample noise from normal dist, scaled by sigma
    if self.sample_velocity:
        #most candidates use the full noise scale...
        eps_higherRange = np.random.normal(
            loc=0, scale=1.0,
            size=(self.N, self.horizon, self.ac_dim)) * self.sigma
        #...but 10% of them use a smaller scale, for finer local search
        lowerRange = 0.3 * self.sigma
        num_lowerRange = int(0.1 * self.N)
        eps_lowerRange = np.random.normal(
            loc=0, scale=1.0,
            size=(num_lowerRange, self.horizon, self.ac_dim)) * lowerRange
        eps_higherRange[-num_lowerRange:] = eps_lowerRange
        eps = eps_higherRange.copy()
    else:
        eps = np.random.normal(
            loc=0, scale=1.0,
            size=(self.N, self.horizon, self.ac_dim)) * self.sigma

    # actions = mean + noise... then smooth the actions temporally
    all_samples = eps.copy()
    for i in range(self.horizon):
        if i == 0:
            all_samples[:, i, :] = self.beta * (
                self.mppi_mean[i, :] + eps[:, i, :]) + (
                    1 - self.beta) * past_action
        else:
            all_samples[:, i, :] = self.beta * (
                self.mppi_mean[i, :] + eps[:, i, :]) + (
                    1 - self.beta) * all_samples[:, i - 1, :]

    # The resulting candidate action sequences:
    # all_samples : [N, horizon, ac_dim]
    all_samples = np.clip(all_samples, -1, 1)

    ########################################################################
    ### make each action element be (past K actions) instead of just (curr action)
    ########################################################################

    #all_samples : [N, horizon, ac_dim]
    #all_acs : [N, horizon, K, ac_dim]
    all_acs = turn_acs_into_acsK(actions_taken_so_far, all_samples, self.K,
                                 self.N, self.horizon)

    #################################################
    ### Get result of executing those candidate action sequences
    #################################################

    if self.use_ground_truth_dynamics:
        paths = trajectory_sampler.sample_paths_parallel(
            self.N,
            all_samples,
            actions_taken_so_far,
            starting_fullenvstate,
            self.env,
            suppress_print=True,
        )  #list of dicts, each w observations/actions/etc.

        #the actual number of returned paths is num_cpu*floor(self.N/num_cpu)
        #rather than self.N, so update the parameter accordingly
        self.N = len(paths)
        all_samples = all_samples[:self.N]

        resulting_states = [entry['observations'] for entry in paths]
        resulting_states = np.swapaxes(resulting_states, 0, 1)
        resulting_states_list = [resulting_states]
    else:
        ##################################################
        ### Trajectory sampling: repeat each candidate sequence
        ### self.traj_sampling_ratio times, so each consecutive group of
        ### traj_sampling_ratio model rollouts shares one set of actions
        ##################################################
        all_acs = np.repeat(all_acs, self.traj_sampling_ratio, axis=0)
        resulting_states_list = self.dyn_models.do_forward_sim(
            [curr_state_K, 0], np.copy(all_acs))
        resulting_states_list = np.swapaxes(
            resulting_states_list, 0, 1)  #[ensSize, horizon+1, N, statesize]

    ############################
    ### evaluate the predicted trajectories
    ############################

    # calculate costs [N,]
    use_catastrophe_in_prediction = self.catastrophe_pred and self.finetuning
    costs, mean_costs, std_costs = calculate_costs(
        resulting_states_list, all_samples, self.reward_func, evaluating,
        take_exploratory_actions, self.traj_sampling_ratio,
        use_catastrophe_in_prediction, self.risk_aversion_type,
        self.caution_beta)

    # uses all paths to update the action mean (for horizon steps)
    # Note: mppi_update needs rewards, so pass in -costs
    selected_action = self.mppi_update(-costs, -mean_costs, std_costs,
                                       all_samples)

    ## Debugging: keep a sliding window of the last (horizon+1) action means
    prediction_horizon = 7
    self.action_history.append(np.copy(self.mppi_mean))
    if len(self.action_history) > self.horizon + 1:
        self.action_history.pop(0)

    # debugging catastrophe prediction: re-simulate the plan chosen
    # (prediction_horizon+1) steps ago and compare the predicted catastrophe
    # signal against the label observed now
    if len(self.observation_history) == self.horizon + 1:
        horizon_steps_ago_state = np.expand_dims(
            np.array(self.observation_history[-(prediction_horizon + 1)]), 0)
        selected_acs = np.expand_dims(
            np.expand_dims(self.action_history[-(prediction_horizon + 1)], 0),
            2)
        resulting_final_actions_states_list = self.dyn_models.do_forward_sim(
            [horizon_steps_ago_state, 0], np.copy(selected_acs))
        resulting_final_actions_states_list = np.array(
            resulting_final_actions_states_list)[prediction_horizon, :, :, -1]
        #sigmoid of the predicted catastrophe signal (last state dim)
        cat_pred = expit(resulting_final_actions_states_list)

        self.catastrophe_labels.append(self.observation_history[-1][-1])
        self.catastrophe_scores.append(cat_pred)

    #########################################
    ### execute the candidate action sequences on the real dynamics
    ### instead of just on the model
    ### useful for debugging/analysis...
    #########################################
    if self.execute_sideRollouts:
        if (step_number % self.horizon) == 0:
            cmap = plt.get_cmap('jet_r')
            num_sims = 5
            indices_to_vis = [0, 1, 2, 3, 4, 6, -3, -2]
            curr_plot = 1
            num_plots = len(indices_to_vis)
            for index_state_to_vis in indices_to_vis:
                plt.subplot(num_plots, 1, curr_plot)
                plt.ylabel(index_state_to_vis)
                for sim_num in range(num_sims):
                    true_states = do_groundtruth_rollout(
                        all_samples[sim_num], self.env,
                        starting_fullenvstate, actions_taken_so_far)
                    color = cmap(float(sim_num) / num_sims)

                    #dashed: model prediction; solid: ground truth
                    plt.plot(
                        resulting_states_list[-1]
                        [:, sim_num, index_state_to_vis],
                        '--',
                        c=color,
                        label=sim_num)
                    plt.plot(
                        np.array(true_states)[:, index_state_to_vis],
                        '-',
                        c=color)
                curr_plot += 1

            if self.plot_sideRollouts:
                plt.legend()
                plt.show()

    return selected_action, resulting_states_list
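#########################################
### Reference sketch: the reward-weighted update that `mppi_update`
### (defined elsewhere in this class) is assumed to perform, i.e. standard
### MPPI softmax averaging. `self.kappa` is an assumed temperature
### hyperparameter, and the real method may also use mean_rewards/std_costs.
#########################################
def mppi_update_sketch(self, rewards, mean_rewards, std_costs, all_samples):
    import numpy as np

    #softmax weights over the N candidates (higher reward -> higher weight);
    #subtracting the max keeps the exponentials numerically stable
    weights = np.exp(self.kappa * (rewards - np.max(rewards)))  #[N,]
    weights = weights / (np.sum(weights) + 1e-10)

    #new mean = weighted average of the sampled action sequences
    self.mppi_mean = np.sum(
        all_samples * weights[:, None, None], axis=0)  #[horizon, ac_dim]

    #execute the first action of the updated mean (current timestep)
    return self.mppi_mean[0]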
def get_action(self, step_number, curr_state_K, goal, actions_taken_so_far,
               starting_fullenvstate, evaluating, take_exploratory_actions):
    """Goal-conditioned MPPI variant: identical to the MPPI get_action above,
    except that `goal` is passed through to the cost calculation."""

    # init vars
    curr_state_K = np.array(curr_state_K)  #[K, sa_dim]

    # remove the 1st entry of mean (mean from past timestep, that was just executed)
    # and copy the last entry (starting point for the next timestep)
    past_action = self.mppi_mean[0].copy()
    self.mppi_mean[:-1] = self.mppi_mean[1:]

    ##############################################
    ## sample candidate action sequences
    ## by creating smooth filtered trajecs (noised around a mean)
    ##############################################

    np.random.seed()  # to get different action samples for each rollout

    #sample noise from normal dist, scaled by sigma
    if self.sample_velocity:
        #most candidates use the full noise scale...
        eps_higherRange = np.random.normal(
            loc=0, scale=1.0,
            size=(self.N, self.horizon, self.ac_dim)) * self.sigma
        #...but 10% of them use a smaller scale, for finer local search
        lowerRange = 0.3 * self.sigma
        num_lowerRange = int(0.1 * self.N)
        eps_lowerRange = np.random.normal(
            loc=0, scale=1.0,
            size=(num_lowerRange, self.horizon, self.ac_dim)) * lowerRange
        eps_higherRange[-num_lowerRange:] = eps_lowerRange
        eps = eps_higherRange.copy()
    else:
        eps = np.random.normal(
            loc=0, scale=1.0,
            size=(self.N, self.horizon, self.ac_dim)) * self.sigma

    # actions = mean + noise... then smooth the actions temporally
    all_samples = eps.copy()
    for i in range(self.horizon):
        if i == 0:
            all_samples[:, i, :] = self.beta * (
                self.mppi_mean[i, :] + eps[:, i, :]) + (
                    1 - self.beta) * past_action
        else:
            all_samples[:, i, :] = self.beta * (
                self.mppi_mean[i, :] + eps[:, i, :]) + (
                    1 - self.beta) * all_samples[:, i - 1, :]

    # The resulting candidate action sequences:
    # all_samples : [N, horizon, ac_dim]
    all_samples = np.clip(all_samples, -1, 1)

    ########################################################################
    ### make each action element be (past K actions) instead of just (curr action)
    ########################################################################

    #all_samples : [N, horizon, ac_dim]
    #all_acs : [N, horizon, K, ac_dim]
    all_acs = turn_acs_into_acsK(actions_taken_so_far, all_samples, self.K,
                                 self.N, self.horizon)

    #################################################
    ### Get result of executing those candidate action sequences
    #################################################

    if self.use_ground_truth_dynamics:
        paths = trajectory_sampler.sample_paths_parallel(
            self.N,
            all_samples,
            actions_taken_so_far,
            starting_fullenvstate,
            self.env,
            suppress_print=True,
        )  #list of dicts, each w observations/actions/etc.

        #the actual number of returned paths is num_cpu*floor(self.N/num_cpu)
        #rather than self.N, so update the parameter accordingly
        self.N = len(paths)
        all_samples = all_samples[:self.N]

        resulting_states = [entry['observations'] for entry in paths]
        resulting_states = np.swapaxes(resulting_states, 0, 1)
        resulting_states_list = [resulting_states]
    else:
        resulting_states_list = self.dyn_models.do_forward_sim(
            [curr_state_K, 0], np.copy(all_acs))
        resulting_states_list = np.swapaxes(
            resulting_states_list, 0, 1)  #[ensSize, horizon+1, N, statesize]

    ############################
    ### evaluate the predicted trajectories
    ############################

    # calculate costs [N,]
    costs, mean_costs, std_costs = calculate_costs(
        resulting_states_list, goal, all_samples, self.reward_func,
        evaluating, take_exploratory_actions)

    # uses all paths to update the action mean (for horizon steps)
    # Note: mppi_update needs rewards, so pass in -costs
    selected_action = self.mppi_update(-costs, -mean_costs, std_costs,
                                       all_samples)

    #########################################
    ### side rollouts on the real dynamics (see the MPPI controller above)
    ### are disabled in this goal-conditioned variant
    #########################################

    return selected_action, resulting_states_list
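############################
### Reference sketch: what `calculate_costs` is assumed to compute in this
### goal-conditioned variant: accumulate negative rewards along each
### predicted trajectory, then aggregate over the ensemble. The
### `reward_func(next_states, goal, actions)` signature and the
### `disagreement_weight` bonus are assumptions for illustration only.
############################
def calculate_costs_sketch(resulting_states_list, goal, all_samples,
                           reward_func, evaluating, take_exploratory_actions):
    import numpy as np

    #resulting_states_list : [ensSize, horizon+1, N, statesize]
    ens_size, horizon_p1, N, _ = np.shape(resulting_states_list)

    per_model_costs = np.zeros((ens_size, N))
    for m in range(ens_size):
        for t in range(horizon_p1 - 1):
            #cost = negative reward of each predicted next state w.r.t. goal
            r = reward_func(resulting_states_list[m][t + 1], goal,
                            all_samples[:, t, :])  #[N,]
            per_model_costs[m] -= r

    mean_costs = np.mean(per_model_costs, axis=0)  #[N,]
    std_costs = np.std(per_model_costs, axis=0)  #[N,]

    if take_exploratory_actions and not evaluating:
        #prefer candidates the ensemble disagrees on
        disagreement_weight = 1.0  #assumed hyperparameter
        costs = mean_costs - disagreement_weight * std_costs
    else:
        costs = mean_costs
    return costs, mean_costs, std_costs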
def get_action(self, step_number, curr_state_K, actions_taken_so_far,
               starting_fullenvstate, evaluating, take_exploratory_actions):
    """Select optimal action (CEM: iteratively refit a truncated-normal
    sampling distribution to the lowest-cost "elite" action sequences)

    Args:
        curr_state_K:
            current "state" as known by the dynamics model:
            a concatenation of the current obs and the (K-1) past obs
        step_number:
            which step number the rollout is currently on
            (used to calculate costs)
        actions_taken_so_far:
            used to restore the env to the correct state,
            when using ground-truth dynamics
        starting_fullenvstate:
            full state of env before this rollout, used for env resets
            (when using ground-truth dynamics)
        evaluating:
            if True: default to not having any noise on the executed action
        take_exploratory_actions:
            if True: select action based on disagreement of ensembles
            if False: (default) select action based on predicted costs

    Returns:
        best_action: optimal action to perform, according to this controller
        resulting_states_list: predicted results of executing the candidate
            action sequences
    """

    #initial mean and var of the sampling normal dist
    mean = np.zeros((self.sol_dim,))
    var = 5 * np.ones((self.sol_dim,))
    X = stats.truncnorm(
        self.lb, self.ub, loc=np.zeros_like(mean), scale=np.ones_like(mean))

    #stop if variance is very low, or if enough iters
    t = 0
    while (t < self.max_iters) and (np.max(var) > self.epsilon):

        #shrink the variance near the bounds so samples stay inside [lb, ub]
        lb_dist = mean - self.lb
        ub_dist = self.ub - mean
        constrained_var = np.minimum(
            np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var)

        #get samples
        all_samples_orig = X.rvs(size=[self.N, self.sol_dim]) * np.sqrt(
            constrained_var) + mean  #[N, ac_dim*horizon]
        all_samples = all_samples_orig.reshape(
            self.N, self.horizon,
            -1)  #interpret each row as a sequence of actions
        all_samples = np.clip(all_samples, -1, 1)

        ########################################################################
        ### make each action element be (past K actions) instead of just (curr action)
        ########################################################################

        #all_samples is [N, horizon, ac_dim]
        all_acs = turn_acs_into_acsK(actions_taken_so_far, all_samples,
                                     self.K, self.N, self.horizon)
        #all_acs is now [N, horizon, K, ac_dim]

        ############################
        ### have model predict the result of executing those candidate action sequences
        ############################

        if self.use_ground_truth_dynamics:
            paths = trajectory_sampler.sample_paths_parallel(
                self.N,
                all_samples,
                actions_taken_so_far,
                starting_fullenvstate,
                self.env,
                suppress_print=True,
            )  #list of dicts, each w observations/actions/etc.

            #the actual number of returned paths is num_cpu*floor(self.N/num_cpu)
            #rather than self.N, so update the parameter accordingly
            self.N = len(paths)
            all_samples = all_samples[:self.N]

            resulting_states = [entry['observations'] for entry in paths]
            resulting_states = np.swapaxes(resulting_states, 0, 1)
            resulting_states_list = [resulting_states]
        else:
            resulting_states_list = self.dyn_models.do_forward_sim(
                [curr_state_K, 0], np.copy(all_acs))
            resulting_states_list = np.swapaxes(
                resulting_states_list, 0,
                1)  #this is now [ensSize, horizon+1, N, statesize]

        ############################
        ### evaluate the predicted trajectories
        ############################

        #calculate costs : [N,]
        costs, mean_costs, std_costs = calculate_costs(
            resulting_states_list, all_samples, self.reward_func, evaluating,
            take_exploratory_actions)

        #pick elites, and refit mean/var
        #Note: these are costs, so pick the lowest few to be elites
        indices = np.argsort(costs)
        elites = all_samples_orig[indices][:self.num_elites]
        new_mean = np.mean(elites, axis=0)
        new_var = np.var(elites, axis=0)

        #interpolate between the old mean/var and the new ones
        mean = self.alpha * mean + (1 - self.alpha) * new_mean
        var = self.alpha * var + (1 - self.alpha) * new_var

        #next iteration
        t += 1

    #return the best action
    best_score = np.min(costs)
    best_sequence = mean.reshape(
        self.horizon, -1)  #interpret the final mean as a sequence of actions
    best_action = np.copy(best_sequence[0])  #(ac_dim,)

    #########################################
    ### execute the candidate action sequences on the real dynamics
    ### instead of just on the model
    ### useful for debugging/analysis...
    #########################################
    if self.execute_sideRollouts:
        if (step_number % self.horizon) == 0:
            cmap = plt.get_cmap('jet_r')
            num_sims = 10
            indices_to_vis = [0, 1, 2]
            curr_plot = 1
            num_plots = len(indices_to_vis)
            for index_state_to_vis in indices_to_vis:
                plt.subplot(num_plots, 1, curr_plot)
                for sim_num in range(num_sims):
                    true_states = do_groundtruth_rollout(
                        all_samples[sim_num], self.env,
                        starting_fullenvstate, actions_taken_so_far)
                    color = cmap(float(sim_num) / num_sims)

                    #dashed: model prediction; solid: ground truth
                    plt.plot(
                        resulting_states_list[-1]
                        [:, sim_num, index_state_to_vis],
                        '--',
                        c=color,
                        label=sim_num)
                    plt.plot(
                        np.array(true_states)[:, index_state_to_vis],
                        '-',
                        c=color)
                curr_plot += 1

            if self.plot_sideRollouts:
                plt.legend()
                plt.show()

    return best_action, resulting_states_list
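#########################################
### Reference sketch: the CEM loop above, reduced to a self-contained toy
### optimizer so the truncated-normal sampling and elite refit are easy to
### verify in isolation. The quadratic cost and all hyperparameter values
### here are made up for illustration.
#########################################
import numpy as np
from scipy import stats

def cem_toy(cost_fn, sol_dim, N=200, num_elites=20, max_iters=10,
            alpha=0.1, lb=-2.0, ub=2.0, epsilon=1e-3):
    mean = np.zeros(sol_dim)
    var = 5 * np.ones(sol_dim)
    X = stats.truncnorm(lb, ub, loc=np.zeros(sol_dim), scale=np.ones(sol_dim))

    t = 0
    while (t < max_iters) and (np.max(var) > epsilon):
        #shrink the variance near the bounds so samples stay inside [lb, ub]
        lb_dist = mean - lb
        ub_dist = ub - mean
        constrained_var = np.minimum(
            np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var)

        #sample candidates, score them, keep the lowest-cost elites
        samples = X.rvs(size=[N, sol_dim]) * np.sqrt(constrained_var) + mean
        costs = np.array([cost_fn(s) for s in samples])
        elites = samples[np.argsort(costs)][:num_elites]

        #interpolate between the old distribution and the elite statistics
        mean = alpha * mean + (1 - alpha) * np.mean(elites, axis=0)
        var = alpha * var + (1 - alpha) * np.var(elites, axis=0)
        t += 1
    return mean

#e.g. minimizing a quadratic recovers its optimum:
#   cem_toy(lambda s: np.sum((s - 0.7) ** 2), sol_dim=4)  ->  approx 0.7 * ones(4)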