def eval_act_sequence(self, model, action_seq, observations, goals):
    """Evaluates a batch of action sequences from the given initial observations.

    Arguments:
        model: the underlying dynamics model
        action_seq: (N x H x dotmap{}), action sequences per initial observation
        observations: dotmap:(N x), initial observations (state, state hist, act hist, latent hist)
        goals: should be shape (N, H+1, dO) or broadcastable

    Returns:
        AttrDict with {trajectory, costs}
    """
    # TODO implement multiple particles

    # run the model forward from the initial observations
    all_obs, all_mouts = rollout(self._env_spec, model, observations, action_seq,
                                 self._advance_obs_fn)

    # unsqueeze a time axis on each per-step output, then concat along it
    all_obs = AttrDict.leaf_combine_and_apply(
        all_obs,
        func=lambda vs: torch.cat(vs, dim=1),
        map_func=lambda arr: arr.unsqueeze(1))
    all_mouts = AttrDict.leaf_combine_and_apply(
        all_mouts,
        func=lambda vs: torch.cat(vs, dim=1),
        map_func=lambda arr: arr.unsqueeze(1))

    costs = self._cost_fn(all_obs, goals, action_seq, all_mouts)

    return AttrDict(
        trajectory=all_obs,
        costs=costs  # (N,)
    )
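# Illustrative sketch only (not part of the original code): the unsqueeze-then-cat
# pattern used above turns a list of H per-step rollout tensors of shape (N, ...)
# into a single (N, H, ...) trajectory tensor. All names and shapes below are
# made-up assumptions for demonstration.
def _example_stack_per_step_outputs():
    import torch
    N, H, dO = 8, 10, 3  # hypothetical batch size, horizon, obs dim
    per_step = [torch.randn(N, dO) for _ in range(H)]  # one tensor per rollout step
    # add a time axis to each step, then concatenate along it
    trajectory = torch.cat([t.unsqueeze(1) for t in per_step], dim=1)
    assert trajectory.shape == (N, H, dO)
    return trajectory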
def eval_policy(dataset, save_file_name):
    b_size = dataset.batch_size
    d_size = len(dataset)

    obs_all = []
    goals_all = []
    output_actions = []

    iters = math.ceil(d_size / b_size)
    for b in range(iters):
        logger.debug("[%d/%d]: Eval policy" % (b, iters))
        idxs = np.arange(start=b * b_size, stop=min((b + 1) * b_size, d_size))
        if args.random_goals:
            inputs, outputs = dataset.get_batch(indices=idxs,
                                                torch_device=model.device,
                                                get_horizon_goals=False)
            # unsqueeze so the sampled goals broadcast against the H+1 horizon goals;
            # use len(idxs) rather than b_size so the final partial batch matches obs
            goals = env_spec.get_uniform(env_spec.goal_names, len(idxs),
                                         torch_device=model.device).unsqueeze(1)
        else:
            inputs, outputs, goals = dataset.get_batch(indices=idxs,
                                                       torch_device=model.device,
                                                       get_horizon_goals=True)

        # get obs batch
        obs = AttrDict()
        for name in env_spec.observation_names:
            obs[name] = inputs[name]

        act = policy.get_action(model, obs, goals, batch=True)

        goals_all.append(goals.leaf_apply(lambda v: to_numpy(v)))
        obs_all.append(obs.leaf_apply(lambda v: to_numpy(v)))
        output_actions.append(act.leaf_apply(lambda v: to_numpy(v)))

    # combine per-batch results into one big dictionary
    combined_obs = AttrDict.leaf_combine_and_apply(
        obs_all, lambda vs: np.concatenate(vs, axis=0))
    combined_goals = AttrDict.leaf_combine_and_apply(
        goals_all, lambda vs: np.concatenate(vs, axis=0))
    combined_output_actions = AttrDict.leaf_combine_and_apply(
        output_actions, lambda vs: np.concatenate(vs, axis=0))

    combined_obs.combine(combined_goals)
    combined_obs.combine(combined_output_actions)

    logger.debug("Saving Action Sequences")
    savemat(save_file_name, combined_obs)
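# Illustrative sketch only: loading the .mat file written by eval_policy for
# offline inspection. scipy.io.loadmat is standard; the actual key names depend
# on the env_spec observation/goal/action names and are not assumed here.
def _example_load_policy_eval(save_file_name):
    import numpy as np
    from scipy.io import loadmat
    data = loadmat(save_file_name)
    # each non-metadata entry is an (N, ...) array concatenated over all eval batches
    for key, value in data.items():
        if not key.startswith("__"):  # skip MATLAB metadata keys
            print(key, np.asarray(value).shape)
    return data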
def _env_step(self, env, dataset, obs, goal):
    # TODO implement online training
    action = self._policy.get_action(self._model, obs, goal)
    next_obs, next_goal, done = env.step(action)

    # append this step's obs/action/goal to the running per-key episode lists
    self._curr_episode_obs = AttrDict.leaf_combine_and_apply(
        [self._curr_episode_obs, next_obs], lambda vs: vs[0] + [vs[1]])
    self._curr_episode_actions = AttrDict.leaf_combine_and_apply(
        [self._curr_episode_actions, action], lambda vs: vs[0] + [vs[1]])
    self._curr_episode_goals = AttrDict.leaf_combine_and_apply(
        [self._curr_episode_goals, next_goal], lambda vs: vs[0] + [vs[1]])
    self._curr_episode_dones.append(done)

    if done:
        # flush the finished episode into the dataset and start a fresh one
        dataset.add_episode(self._curr_episode_obs,
                            self._curr_episode_goals,
                            self._curr_episode_actions,
                            self._curr_episode_dones)
        self._reset_curr_episode()
        next_obs, next_goal = env.reset()

    return next_obs, next_goal
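# Illustrative sketch only: the leaf_combine_and_apply(..., lambda vs: vs[0] + [vs[1]])
# calls above append the newest per-step value onto a running list for each leaf key.
# A plain-dict version of the same accumulation pattern (key names are assumptions):
def _example_accumulate_episode():
    episode = {"obs": [], "act": []}          # running lists, one per key
    step = {"obs": [1.0, 2.0], "act": [0.5]}  # one environment step's values
    for key in episode:
        episode[key] = episode[key] + [step[key]]  # i.e. vs[0] + [vs[1]]
    return episode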
def eval_model(dataset, save_file_name):
    b_size = dataset.batch_size
    d_size = len(dataset)

    pred_trajectories = []
    action_sequences = []
    true_trajectories = []
    costs = []

    iters = math.ceil(d_size / b_size)
    for b in range(iters):
        logger.debug("[%d/%d]: Eval model" % (b, iters))
        idxs = np.arange(start=b * b_size, stop=min((b + 1) * b_size, d_size))
        inputs, outputs, goals = dataset.get_batch(indices=idxs,
                                                   torch_device=model.device,
                                                   get_horizon_goals=True,
                                                   get_action_seq=True)

        # get obs batch
        obs = AttrDict()
        for name in env_spec.observation_names:
            obs[name] = inputs[name]

        act_seq = AttrDict()
        act_seq['act'] = inputs['act_seq']

        model.eval()
        all_obs, all_mouts = rollout(env_spec, model, obs, act_seq,
                                     policy._advance_obs_fn)

        # first unsqueezes and then concats
        all_obs = AttrDict.leaf_combine_and_apply(
            all_obs,
            func=lambda vs: torch.cat(vs, dim=1),
            map_func=lambda arr: arr.unsqueeze(1))
        all_mouts = AttrDict.leaf_combine_and_apply(
            all_mouts,
            func=lambda vs: torch.cat(vs, dim=1),
            map_func=lambda arr: arr.unsqueeze(1))

        cost_dict = AttrDict(
            {'costs': policy._cost_fn(all_obs, goals, act_seq, all_mouts)})

        true_trajectories.append(goals.leaf_apply(lambda v: to_numpy(v)))
        pred_trajectories.append(all_obs.leaf_apply(lambda v: to_numpy(v)))
        action_sequences.append(act_seq.leaf_apply(lambda v: to_numpy(v)))
        costs.append(cost_dict.leaf_apply(lambda v: to_numpy(v)))

    # one big dictionary
    final_dict = AttrDict.leaf_combine_and_apply(
        true_trajectories, lambda vs: np.concatenate(vs, axis=0))
    combined_pred = AttrDict.leaf_combine_and_apply(
        pred_trajectories, lambda vs: np.concatenate(vs, axis=0))
    combined_acts = AttrDict.leaf_combine_and_apply(
        action_sequences, lambda vs: np.concatenate(vs, axis=0))
    combined_costs = AttrDict.leaf_combine_and_apply(
        costs, lambda vs: np.concatenate(vs, axis=0))

    final_dict.combine(combined_pred)
    final_dict.combine(combined_acts)  # no overlapping keys
    final_dict.combine(combined_costs)

    logger.debug("Saving Model Trajectories")
    logger.debug("Keys: " + str(final_dict.keys()))
    savemat(save_file_name, final_dict)
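# Illustrative sketch only: one way to compare the saved predicted and true
# trajectories offline. The key names ('goal_obs' for the ground-truth horizon,
# 'obs' for the model rollout) and shapes are assumptions; the real keys come
# from env_spec and the dataset.
def _example_rollout_error(save_file_name):
    import numpy as np
    from scipy.io import loadmat
    data = loadmat(save_file_name)
    true_traj = np.asarray(data["goal_obs"])  # hypothetical key: (N, H+1, dO)
    pred_traj = np.asarray(data["obs"])       # hypothetical key: (N, H, dO)
    H = pred_traj.shape[1]
    # mean squared error per horizon step, averaged over batch and obs dims
    per_step_mse = ((pred_traj - true_traj[:, 1:H + 1]) ** 2).mean(axis=(0, 2))
    return per_step_mse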
def get_action(self, model, observation, goal, batch=False):
    """Optimizes the cost function provided in setup().

    Arguments:
        model: must be callable(action_sequence, observation, goal) and return
            costs (torch array), where action_sequence is an AttrDict consisting
            of keys only in self.action_names
        observation: {}
        goal: {goal_obs} where goal_obs must be N x H+1 x ...
        batch:

    Returns:
        AttrDict with {action_sequence, results {costs, order}}
    """
    # generate random sequence
    batch_size = goal.goal_obs.shape[0]  # requires goal_obs to be a key
    device = goal.goal_obs.device

    if not batch:
        observation = observation.leaf_apply(
            lambda arr: arr.unsqueeze(0).repeat_interleave(self._pop_size, dim=0))
        goal = goal.leaf_apply(
            lambda arr: arr.unsqueeze(0).repeat_interleave(self._pop_size, dim=0))
    else:
        observation = observation.leaf_apply(
            lambda arr: arr.repeat_interleave(self._pop_size, dim=0))
        goal = goal.leaf_apply(
            lambda arr: arr.repeat_interleave(self._pop_size, dim=0))

    action_sequence = self._env_spec.get_uniform(
        self._action_names,
        batch_size=batch_size * self._pop_size * self._horizon)
    action_sequence.leaf_modify(lambda x: split_dim(
        torch.from_numpy(x).to(device), dim=0,
        new_shape=[batch_size * self._pop_size, self._horizon]))

    def resample_and_flatten(vs):
        # sample a new population around the elite mean/std, clip to the
        # action space, and flatten back to (B*Pop, H, ...)
        old_acseq = vs[0]
        mean, std = vs[1], vs[2]
        sample = torch.randn_like(old_acseq) * std + mean
        d = AttrDict(act=sample)
        self._env_spec.clip(d, ['act'])
        return d.act.view([-1] + list(old_acseq.shape[2:]))

    best_initial_act = None
    results = None

    for i in range(self._max_iters):
        # run the model
        results = model(action_sequence, observation, goal)

        # view as (B, Pop, ...)
        action_sequence.leaf_modify(
            lambda x: split_dim(x, 0, [batch_size, self._pop_size]))
        results.leaf_modify(
            lambda x: split_dim(x, 0, [batch_size, self._pop_size]))

        results.order = torch.argsort(results.costs, dim=1)  # lowest to highest
        best = results.order[:, :self._num_elites]
        best = best.unsqueeze(-1).unsqueeze(-1).expand(
            (-1, -1, self._horizon, self._act_dim))
        best_act_seq = action_sequence.leaf_apply(
            lambda x: torch.gather(x, 1, best))
        best_initial_act = best_act_seq.leaf_apply(
            lambda x: x[:, 0, 0])  # where x is (B, Pop, H, ...)
        means = best_act_seq.leaf_apply(lambda x: x.mean(1, keepdim=True))
        stds = best_act_seq.leaf_apply(lambda x: x.std(1, keepdim=True))

        if i < self._max_iters - 1:
            # resampling
            action_sequence = AttrDict.leaf_combine_and_apply(
                [action_sequence, means, stds], resample_and_flatten)

    # best_initial_act is (B, actdim)
    best_initial_act.action_sequence = action_sequence  # (B*Pop, horizon, actdim)
    best_initial_act.results = results
    return best_initial_act
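# Illustrative sketch only: a minimal, standalone CEM loop on a toy quadratic
# cost, mirroring the elite-selection / Gaussian-refit structure of get_action.
# Population size, horizon, iteration count, and the cost function are all
# made up for demonstration; this is not the policy's actual configuration.
def _example_cem_toy():
    import torch
    pop, horizon, act_dim, num_elites, iters = 64, 5, 2, 8, 4
    target = torch.ones(act_dim)  # toy goal: every action close to 1
    mean = torch.zeros(horizon, act_dim)
    std = torch.ones(horizon, act_dim)
    for _ in range(iters):
        # sample a population of action sequences around the current mean
        samples = torch.randn(pop, horizon, act_dim) * std + mean
        costs = ((samples - target) ** 2).sum(dim=(1, 2))  # (pop,)
        order = torch.argsort(costs)               # lowest cost first
        elite = samples[order[:num_elites]]        # (num_elites, horizon, act_dim)
        mean, std = elite.mean(0), elite.std(0)    # refit the sampling distribution
    return mean[0]  # best initial action, analogous to best_initial_act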