예제 #1
0
    def eval_act_sequence(self, model, action_seq, observations, goals):
        """Roll the model forward over a batch of action sequences and score the result.

        Arguments:
            model: the underlying dynamics model, handed straight to rollout()
            action_seq: action sequences, one per initial observation
                        (original author's note: N x H x dotmap{})
            observations: initial observations the rollout starts from
                        (original author's note: state, state hist, act hist, latent hist)
            goals: forwarded to the cost function; noted by the original author as
                   shape (N, H+1, dO) or broadcastable
        Returns: AttrDict{trajectory, costs}
        """

        # TODO implement multiple particles

        def stack_along_dim1(pieces):
            # every leaf gets a fresh axis at dim 1 and the pieces are then
            # concatenated along that axis (presumably the time axis -- TODO confirm)
            return AttrDict.leaf_combine_and_apply(
                pieces,
                func=lambda vs: torch.cat(vs, dim=1),
                map_func=lambda arr: arr.unsqueeze(1))

        # run the model forward from the initial observations
        rolled_obs, rolled_mouts = rollout(self._env_spec, model, observations,
                                           action_seq, self._advance_obs_fn)

        trajectory = stack_along_dim1(rolled_obs)
        model_outputs = stack_along_dim1(rolled_mouts)

        return AttrDict(
            trajectory=trajectory,
            # original author's note: costs is (N,)
            costs=self._cost_fn(trajectory, goals, action_seq, model_outputs)
        )
예제 #2
0
def eval_policy(dataset, save_file_name):
    """Run the policy over every batch of ``dataset`` and save the results with savemat.

    For each batch the observations are pulled out of the dataset inputs, goals are
    either taken from the dataset or sampled uniformly (when ``args.random_goals`` is
    set), and ``policy.get_action`` is queried in batch mode. Observations, goals and
    policy outputs of all batches are concatenated along axis 0, merged into a single
    dictionary and written to ``save_file_name``.

    Reads ``args``, ``model``, ``env_spec``, ``policy`` and ``logger`` from the
    enclosing (module) scope.

    Arguments:
        dataset: must expose ``batch_size``, ``len()`` and ``get_batch(...)``
        save_file_name: destination handed to ``savemat``
    """
    b_size = dataset.batch_size
    d_size = len(dataset)

    obs_all = []
    goals_all = []
    output_actions = []
    iters = math.ceil(d_size / b_size)
    for b in range(iters):
        logger.debug("[%d/%d]: Eval policy", b, iters)
        idxs = np.arange(start=b * b_size, stop=min((b + 1) * b_size, d_size))
        if args.random_goals:
            inputs, _ = dataset.get_batch(indices=idxs,
                                          torch_device=model.device,
                                          get_horizon_goals=False)
            # Sample exactly one goal per observation in this batch: the last batch
            # holds fewer than b_size entries whenever d_size % b_size != 0, and a
            # goal batch of b_size rows would then no longer line up with obs.
            # The unsqueeze is to account for broadcasting to H+1 goals.
            # NOTE(review): this relies on the object returned by get_uniform
            # supporting .unsqueeze(1) as well as .leaf_apply below -- confirm.
            goals = env_spec.get_uniform(
                env_spec.goal_names, len(idxs),
                torch_device=model.device).unsqueeze(1)
        else:
            inputs, _, goals = dataset.get_batch(
                indices=idxs,
                torch_device=model.device,
                get_horizon_goals=True)

        # get obs batch: keep only the observation entries of the inputs
        obs = AttrDict()
        for name in env_spec.observation_names:
            obs[name] = inputs[name]

        act = policy.get_action(model, obs, goals, batch=True)

        goals_all.append(goals.leaf_apply(lambda v: to_numpy(v)))
        obs_all.append(obs.leaf_apply(lambda v: to_numpy(v)))
        output_actions.append(act.leaf_apply(lambda v: to_numpy(v)))

    # one big dictionary: per-batch results are joined along axis 0
    combined_obs = AttrDict.leaf_combine_and_apply(
        obs_all, lambda vs: np.concatenate(vs, axis=0))
    combined_goals = AttrDict.leaf_combine_and_apply(
        goals_all, lambda vs: np.concatenate(vs, axis=0))
    combined_output_actions = AttrDict.leaf_combine_and_apply(
        output_actions, lambda vs: np.concatenate(vs, axis=0))

    combined_obs.combine(combined_goals)
    combined_obs.combine(combined_output_actions)

    logger.debug("Saving Action Sequences")
    savemat(save_file_name, combined_obs)
 def _env_step(self, env, dataset, obs, goal):
     """Advance ``env`` by one policy action and record the step in the episode buffers.

     The chosen action, the resulting observation/goal and the done flag are appended
     to the current-episode buffers. When the step reports done, the finished episode
     is handed to ``dataset.add_episode``, the buffers are reset and the environment
     is reset.

     Returns: (next_obs, next_goal) -- taken from ``env.reset()`` if the episode ended,
              otherwise from ``env.step(action)``
     """
     # TODO implement online training

     def push_latest(pair):
         # pair[0] is the list accumulated so far, pair[1] the newest entry
         return pair[0] + [pair[1]]

     action = self._policy.get_action(self._model, obs, goal)
     next_obs, next_goal, done = env.step(action)

     self._curr_episode_obs = AttrDict.leaf_combine_and_apply(
         [self._curr_episode_obs, next_obs], push_latest)
     self._curr_episode_actions = AttrDict.leaf_combine_and_apply(
         [self._curr_episode_actions, action], push_latest)
     self._curr_episode_goals = AttrDict.leaf_combine_and_apply(
         [self._curr_episode_goals, next_goal], push_latest)
     self._curr_episode_dones.append(done)

     if not done:
         return next_obs, next_goal

     # episode finished: store it, clear the buffers and start a new episode
     dataset.add_episode(self._curr_episode_obs,
                         self._curr_episode_goals,
                         self._curr_episode_actions,
                         self._curr_episode_dones)
     self._reset_curr_episode()
     return env.reset()
def eval_model(dataset, save_file_name):
    """Roll the model out over every batch of ``dataset`` and save the results with savemat.

    For each batch the dataset's action sequence is rolled out from the batch
    observations, the rollout is scored with ``policy._cost_fn``, and goals, rollout,
    actions and costs are converted with ``to_numpy``. All batches are then
    concatenated along axis 0, merged into one dictionary and written to
    ``save_file_name``.

    Reads ``model``, ``env_spec``, ``policy`` and ``logger`` from the enclosing
    (module) scope.

    Arguments:
        dataset: must expose ``batch_size``, ``len()`` and ``get_batch(...)``
        save_file_name: destination handed to ``savemat``
    """

    def stack_along_dim1(pieces):
        # first unsqueezes each leaf at dim 1, then concats along that dim
        return AttrDict.leaf_combine_and_apply(
            pieces,
            func=lambda vs: torch.cat(vs, dim=1),
            map_func=lambda arr: arr.unsqueeze(1))

    def as_numpy(tree):
        return tree.leaf_apply(lambda v: to_numpy(v))

    def join_batches(chunks):
        return AttrDict.leaf_combine_and_apply(
            chunks, lambda vs: np.concatenate(vs, axis=0))

    b_size = dataset.batch_size
    d_size = len(dataset)

    goal_batches, rollout_batches, action_batches, cost_batches = [], [], [], []

    iters = math.ceil(d_size / b_size)
    for b in range(iters):
        logger.debug("[%d/%d]: Eval model" % (b, iters))
        first = b * b_size
        idxs = np.arange(start=first, stop=min(first + b_size, d_size))
        inputs, outputs, goals = dataset.get_batch(
            indices=idxs,
            torch_device=model.device,
            get_horizon_goals=True,
            get_action_seq=True)

        # observation entries of this batch
        obs = AttrDict()
        for name in env_spec.observation_names:
            obs[name] = inputs[name]

        # the dataset's action sequence, stored under the 'act' key
        act_seq = AttrDict()
        act_seq['act'] = inputs['act_seq']

        model.eval()
        rolled_obs, rolled_mouts = rollout(env_spec, model, obs, act_seq,
                                           policy._advance_obs_fn)
        rolled_obs = stack_along_dim1(rolled_obs)
        rolled_mouts = stack_along_dim1(rolled_mouts)

        cost_dict = AttrDict(
            {'costs': policy._cost_fn(rolled_obs, goals, act_seq, rolled_mouts)})

        goal_batches.append(as_numpy(goals))
        rollout_batches.append(as_numpy(rolled_obs))
        action_batches.append(as_numpy(act_seq))
        cost_batches.append(as_numpy(cost_dict))

    # one big dictionary: join every group over batches, then fold them together
    final_dict, *remaining = [
        join_batches(group)
        for group in (goal_batches, rollout_batches, action_batches, cost_batches)
    ]
    for extra in remaining:
        # original author's note: no overlapping keys between the groups
        final_dict.combine(extra)

    logger.debug("Saving Model Trajectories")
    logger.debug("Keys: " + str(final_dict.keys()))
    savemat(save_file_name, final_dict)
예제 #5
0
    def get_action(self, model, observation, goal, batch=False):
        """Optimizes the cost function provided in setup() by sampling and elite refitting.

        A population of ``self._pop_size`` action sequences per batch element is drawn
        uniformly, then for ``self._max_iters`` iterations: the population is scored
        with ``model``, the ``self._num_elites`` lowest-cost sequences per batch element
        are selected, and (except on the last iteration) the population is resampled
        from a Gaussian with the elites' mean and std, clipped via ``self._env_spec``.

        Arguments:
            model: must be callable(action_sequence, observation, goal) and return a
                    result with a ``costs`` leaf (torch array), where action_sequence
                    is an AttrDict consisting of keys only in self.action_names
            observation: AttrDict of torch arrays; leaves carry a leading batch dim
                    only when ``batch`` is True
            goal: {goal_obs}; with ``batch=True`` goal_obs must be N x H+1 x ...,
                    with ``batch=False`` the leading batch dim is added here
            batch: whether observation/goal already have a leading batch dimension

        Returns:
            AttrDict holding, per action key, the first action of the lowest-cost
            sequence for each batch element, plus ``action_sequence`` (the final
            population) and ``results`` {costs, order}
        """
        # Without a batch dim the leading axis of goal_obs is not a batch axis, so
        # reading shape[0] there would size the action population off the wrong
        # dimension and no longer match the pop_size-tiled observation/goal below.
        batch_size = goal.goal_obs.shape[0] if batch else 1  # requires goal_obs to be a key
        device = goal.goal_obs.device

        # tile observation and goal so there is one copy per population member
        if not batch:
            observation = observation.leaf_apply(lambda arr: arr.unsqueeze(
                0).repeat_interleave(self._pop_size, dim=0))
            goal = goal.leaf_apply(lambda arr: arr.unsqueeze(0).
                                   repeat_interleave(self._pop_size, dim=0))
        else:
            observation = observation.leaf_apply(
                lambda arr: arr.repeat_interleave(self._pop_size, dim=0))
            goal = goal.leaf_apply(
                lambda arr: arr.repeat_interleave(self._pop_size, dim=0))

        # generate random sequence: one uniform sample per (batch, member, step),
        # then regroup the leading dim into (B*Pop, H)
        action_sequence = self._env_spec.get_uniform(
            self._action_names,
            batch_size=batch_size * self._pop_size * self._horizon)
        action_sequence.leaf_modify(lambda x: split_dim(
            torch.from_numpy(x).to(device),
            dim=0,
            new_shape=[batch_size * self._pop_size, self._horizon]))

        def resample_and_flatten(vs):
            # vs = [current population, elite mean, elite std] for one leaf
            old_acseq = vs[0]
            mean, std = vs[1], vs[2]
            sample = torch.randn_like(old_acseq) * std + mean
            d = AttrDict(act=sample)
            self._env_spec.clip(d, ['act'])
            # merge the two leading dims again so the model sees a flat population
            return d.act.view([-1] + list(old_acseq.shape[2:]))

        best_initial_act = None
        results = None
        for i in range(self._max_iters):
            # run the model
            results = model(action_sequence, observation, goal)

            # view as (B, Pop, ...)
            action_sequence.leaf_modify(
                lambda x: split_dim(x, 0, [batch_size, self._pop_size]))
            results.leaf_modify(
                lambda x: split_dim(x, 0, [batch_size, self._pop_size]))

            results.order = torch.argsort(results.costs,
                                          dim=1)  # lowest to highest

            # gather the elite sequences; the index is expanded over (H, act_dim)
            best = results.order[:, :self._num_elites]
            best = best.unsqueeze(-1).unsqueeze(-1).expand(
                (-1, -1, self._horizon, self._act_dim))
            best_act_seq = action_sequence.leaf_apply(
                lambda x: torch.gather(x, 1, best))
            best_initial_act = best_act_seq.leaf_apply(
                lambda x: x[:, 0, 0])  # where x is (B, Pop, H ...)
            means = best_act_seq.leaf_apply(lambda x: x.mean(1, keepdim=True))
            stds = best_act_seq.leaf_apply(lambda x: x.std(1, keepdim=True))

            if i < self._max_iters - 1:
                # resampling
                action_sequence = AttrDict.leaf_combine_and_apply(
                    [action_sequence, means, stds], resample_and_flatten)

        # act is (B, actdim)
        # The last iteration does not resample, so action_sequence is returned in the
        # (B, Pop, ...) view produced by split_dim above, not flattened to (B*Pop, ...).
        best_initial_act.action_sequence = action_sequence
        best_initial_act.results = results

        return best_initial_act