Example 1
 def denormalize(self, v):
     mean = ptu.np_to_var(self.mean, requires_grad=False)
     std = ptu.np_to_var(self.std, requires_grad=False)
     if v.dim() == 2:
         mean = mean.unsqueeze(0)
         std = std.unsqueeze(0)
     return mean + v * std
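All of these examples rely on the rlkit-style helpers ptu.np_to_var and ptu.get_numpy to move data between NumPy and PyTorch. For orientation only, here is a minimal sketch of what those helpers are assumed to do; the bodies below are written against modern PyTorch (where Variable no longer exists) and are assumptions, not the library's actual implementation.

import numpy as np
import torch


def np_to_var(np_array, requires_grad=False):
    # Assumed behavior: wrap a NumPy array as a float32 tensor
    # (pre-0.4 PyTorch returned a Variable here).
    return torch.from_numpy(np.asarray(np_array)).float().requires_grad_(requires_grad)


def get_numpy(tensor):
    # Assumed behavior: detach from the autograd graph and copy back to NumPy.
    return tensor.detach().cpu().numpy()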
Example 2
    def get_action(self, current_ob, goal, num_steps_left):
        # Replan when requested at every step, when the current plan is
        # exhausted, or on the very first call.
        if (self.replan_every_time_step
                or self.t_in_plan == self.planning_horizon
                or self.last_solution is None):
            if self.dynamic_lm and self.best_obs_seq is not None:
                error = np.linalg.norm(current_ob -
                                       self.best_obs_seq[self.t_in_plan + 1])
                self.update_lagrange_multiplier(error)

            full_solution = self.replan(current_ob, goal)

            x_torch = ptu.np_to_var(full_solution, requires_grad=True)
            current_ob_torch = ptu.np_to_var(current_ob)

            obs, next_obs = self.batchify(x_torch, current_ob_torch)
            actions = self.tdm_policy(
                observations=obs,
                goals=next_obs,
                num_steps_left=self.num_steps_left_pytorch,
            )
            self.best_action_seq = ptu.get_numpy(actions)
            self.best_obs_seq = np.array([current_ob] +
                                         [ptu.get_numpy(o) for o in next_obs])

            self.last_solution = full_solution
            self.t_in_plan = 0

        agent_info = dict(
            best_action_seq=self.best_action_seq[self.t_in_plan:],
            best_obs_seq=self.best_obs_seq[self.t_in_plan:],
        )
        action = self.best_action_seq[self.t_in_plan]
        self.t_in_plan += 1

        return action, agent_info
Example 3
    def get_action(self, obs):
        state = self.expand_np_to_var(obs)
        first_sampled_actions = self.sample_actions()
        action = ptu.np_to_var(first_sampled_actions)
        next_state = ptu.np_to_var(self.env.sample_states(self.sample_size))

        penalties = []
        for i in range(self.horizon):
            constraint_penalty = self.qf(
                state,
                action,
                self.env.convert_obs_to_goal_states_pytorch(next_state),
                self._tau_batch,
            )**2
            penalties.append(
                - self.constraint_weight * constraint_penalty
            )

            action = ptu.np_to_var(
                self.env.sample_actions(self.sample_size)
            )
            state = next_state
            next_state = ptu.np_to_var(self.env.sample_states(self.sample_size))
        reward = self.reward(state, action, next_state)
        final_score = reward + sum(penalties)
        max_i = np.argmax(ptu.get_numpy(final_score))
        return first_sampled_actions[max_i], {}
Example 4
    def _get_action(self, current_ob, goal):
        if (self.replan_every_time_step
                or self.t_in_plan == self.planning_horizon
                or self.last_solution is None):
            full_solution = self.replan(current_ob, goal)

            x_torch = ptu.np_to_var(full_solution, requires_grad=True)
            current_ob_torch = ptu.np_to_var(current_ob)

            _, next_obs = self.batchify(x_torch, current_ob_torch)
            self.subgoal_seq = np.array([current_ob] +
                                        [ptu.get_numpy(o) for o in next_obs])
            self.planned_action_seq = self.goal_reaching_policy.eval_np(
                self.subgoal_seq[:-1], self.subgoal_seq[1:],
                np.zeros((self.planning_horizon, 1)))
            self.last_solution = full_solution
            self.t_in_plan = 0

        action = self.planned_action_seq[self.t_in_plan]
        new_goal = self.subgoal_seq[self.t_in_plan + 1]
        self.current_goal = new_goal
        oracle_qmax_action = self.get_oracle_qmax_action(current_ob, new_goal)
        if self.use_oracle_argmax_policy:
            action = oracle_qmax_action

        agent_info = dict(
            planned_action_seq=self.planned_action_seq[self.t_in_plan:],
            subgoal_seq=self.subgoal_seq[self.t_in_plan:],
            oracle_qmax_action=oracle_qmax_action,
            full_action_seq=self.planned_action_seq,
            full_obs_seq=self.subgoal_seq,
        )

        self.t_in_plan += 1
        return action, agent_info
Example 5
 def next_state(self, state, action, goal_state, discount):
     state = ptu.np_to_var(np.expand_dims(state, 0))
     action = ptu.np_to_var(np.expand_dims(action, 0))
     goal_state = ptu.np_to_var(np.expand_dims(goal_state, 0))
     discount = ptu.np_to_var(np.array([[discount]]))
     return ptu.get_numpy(
         self.qf(state, action, goal_state, discount) + state)[0]
Example 6
 def get_debug_batch(self, train=True):
     dataset = self.train_dataset if train else self.test_dataset
     X, Y = dataset
     ind = np.random.randint(0, Y.shape[0], self.batch_size)
     X = X[ind, :]
     Y = Y[ind, :]
     return ptu.np_to_var(X), ptu.np_to_var(Y)
Example 7
def train_encoder(encoder, decoder, encoder_opt):
    batch, true_latents = swirl_data(BS)
    batch = ptu.np_to_var(batch)

    latents, means, log_stds, stds = encoder.get_encoding_and_suff_stats(
        batch
    )
    kl = kl_to_prior(means, log_stds, stds)

    latents = encoder.encode(batch)
    decoder_output = decoder(latents)
    decoder_means = decoder_output[:, 0:2]
    decoder_log_stds = decoder_output[:, 2:4]
    distribution = Normal(decoder_means, decoder_log_stds.exp())
    reconstruction_log_prob = distribution.log_prob(batch).sum(dim=1)

    # elbo = - kl + reconstruction_log_prob
    # loss = - elbo.mean()
    loss = - reconstruction_log_prob.mean()
    # This is the second place where we cheat:
    latent_loss = ((ptu.np_to_var(true_latents) - latents) ** 2).mean()
    loss = loss  # + latent_loss
    encoder_opt.zero_grad()
    loss.backward()
    encoder_opt.step()
    return loss
Example 8
def pretrain_encoder(encoder, opt):
    losses = []
    for _ in range(1000):
        x_np, y_np = swirl_data(BS)
        x = ptu.np_to_var(x_np)
        y = ptu.np_to_var(y_np)
        y_hat = encoder.encode(x)
        loss = ((y_hat - y) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

        losses.append(loss.data.numpy())

    if VERBOSE:
        x_np, y_np = swirl_data(N_VIS)
        x = ptu.np_to_var(x_np)
        y_hat = encoder.encode(x)
        y_hat_np = y_hat.data.numpy()
        x_hat_np = t_to_xy(y_hat_np[:, 0])

        plt.subplot(2, 1, 1)
        plt.plot(np.array(losses))
        plt.title("Training Loss")

        plt.subplot(2, 1, 2)
        plt.plot(x_np[:, 0], x_np[:, 1], '.')
        plt.plot(x_hat_np[:, 0], x_hat_np[:, 1], '.')
        plt.title("Samples")
        plt.legend(["Samples", "Estimates"])
        plt.show()
Example 9
 def get_action(self, obs):
     obs, goals, taus = split_flat_obs(obs[None],
                                       self.env.observation_space.low.size,
                                       self.env.goal_dim)
     sampled_actions = self.sample_actions()
     first_sampled_actions = sampled_actions.copy()
     actions = ptu.np_to_var(sampled_actions)
     next_obs = self.expand_np_to_var(obs[0])
     goals = self.expand_np_to_var(goals[0])
     taus = self.expand_np_to_var(taus[0])
     costs = 0
     for i in range(self.mpc_horizon):
         curr_obs = next_obs
         if i > 0:
             sampled_actions = self.sample_actions()
             actions = ptu.np_to_var(sampled_actions)
         flat_obs = merge_into_flat_obs(
             curr_obs,
             goals,
             taus,
         )
         obs_delta = self.debug_qf(flat_obs,
                                   actions,
                                   return_internal_prediction=True)
         next_obs = curr_obs + obs_delta
         next_features = self.env.convert_obs_to_goals(next_obs)
         costs += (next_features[:, :7] - goals[:, :7])**2
     costs_np = ptu.get_numpy(costs).sum(1)
     min_i = np.argmin(costs_np)
     return first_sampled_actions[min_i], {}
Example 10
    def get_action(self, ob):
        if self.last_solution is None or not self.warm_start:
            init_solution = []
            for _ in range(self.planning_horizon):
                init_solution.append(
                    np.repeat(ob[None], self.num_particles, axis=0))

            self.last_solution = np.hstack(init_solution)

        ob = self._expand_np_to_var(ob)
        actions_np = np.hstack(
            [self.sample_actions() for _ in range(self.planning_horizon)])
        actions = ptu.np_to_var(actions_np)
        next_states = ptu.np_to_var(self.last_solution, requires_grad=True)

        optimizer = optim.Adam([next_states], lr=self.learning_rate)
        # Take gradient steps on the imagined next states to reduce the
        # constraint violation before scoring the candidate actions.
        for i in range(self.num_grad_steps):
            constraint_loss = self.constraint_fctn(ob, actions, next_states)
            optimizer.zero_grad()
            constraint_loss.sum().backward()
            optimizer.step()

        final_loss = (self.cost_function(ob, actions, next_states) +
                      self.lagrange_multiplier *
                      self.constraint_fctn(ob, actions, next_states))
        self.last_solution = ptu.get_numpy(next_states)
        final_loss_np = ptu.get_numpy(final_loss).sum(axis=1)
        min_i = np.argmin(final_loss_np)
        action = actions_np[min_i, :self.action_dim]
        return action, {}
Example 11
 def evaluate(x, y):
     action = np.array([x, y])
     action = ptu.np_to_var(action).unsqueeze(0)
     state = ptu.np_to_var(start_state).unsqueeze(0)
     goal_states = ptu.np_to_var(goal_state).unsqueeze(0)
     discount = ptu.np_to_var(np.array([[0]]))
     out = qf(state, action, goal_states, discount)
     return out.data.numpy()
Example 12
 def get_np_action(self, state_np, goal_state_np):
     return ptu.get_numpy(
         self.policy(
             ptu.np_to_var(np.expand_dims(state_np, 0)),
             ptu.np_to_var(np.expand_dims(goal_state_np, 0)),
             self._tau_expanded_torch,
         ).squeeze(0)
     )
Example 13
    def get_action(self, obs):
        sampled_actions = self.sample_actions()
        first_sampled_actions = sampled_actions.copy()
        all_actions_np = [first_sampled_actions]
        actions = ptu.np_to_var(sampled_actions)
        next_obs = self.expand_np_to_var(obs)
        all_obs_torch = [next_obs]
        costs = 0
        all_costs = []
        for i in range(self.mpc_horizon):
            curr_obs = next_obs
            if i > 0:
                sampled_actions = self.sample_actions()
                all_actions_np.append(sampled_actions)
                actions = ptu.np_to_var(sampled_actions)
            next_obs = curr_obs + self.dynamics_model(curr_obs, actions)
            all_obs_torch.append(next_obs)
            new_costs = self.cost_fn(
                ptu.get_numpy(curr_obs),
                ptu.get_numpy(actions),
                ptu.get_numpy(next_obs),
            )
            costs = costs + new_costs
            all_costs.append(new_costs)

        # Reward sum of costs or just last time step?
        # min_i = np.argmin(costs)
        min_costs = np.array(all_costs).min(0)
        min_i = np.argmin(min_costs)

        # For Point2d u-shaped wall
        # best_action_seq = [action_t[min_i, :] for action_t in all_actions_np]
        # best_obs_seq = [
        #     ptu.get_numpy(ob_t[min_i, :]) for ob_t in all_obs_torch
        # ]
        #
        # real_obs_seq = self.env.wrapped_env.wrapped_env.true_states(obs, best_action_seq)
        # self.ax1.clear()
        # self.env.wrapped_env.wrapped_env.plot_trajectory(
        #     self.ax1,
        #     np.array(best_obs_seq),
        #     np.array(best_action_seq),
        #     goal=self.env.wrapped_env.wrapped_env._target_position,
        # )
        # self.ax1.set_title("imagined")
        # self.ax2.clear()
        # self.env.wrapped_env.wrapped_env.plot_trajectory(
        #     self.ax2,
        #     np.array(real_obs_seq),
        #     np.array(best_action_seq),
        #     goal=self.env.wrapped_env.wrapped_env._target_position,
        # )
        # self.ax2.set_title("real")
        # plt.draw()
        # plt.pause(0.001)

        return first_sampled_actions[min_i], {}
Example 14
 def normalize(self, v, clip_range=None):
     if clip_range is None:
         clip_range = self.default_clip_range
     mean = ptu.np_to_var(self.mean, requires_grad=False)
     std = ptu.np_to_var(self.std, requires_grad=False)
     if v.dim() == 2:
         # Unsqueeze a batch dimension and rely on automatic broadcasting.
         mean = mean.unsqueeze(0)
         std = std.unsqueeze(0)
     return torch.clamp((v - mean) / std, -clip_range, clip_range)
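As a quick sanity check (not from the source), the normalize/denormalize pair shown in Example 14 and Example 1 round-trips exactly whenever no element hits the clip range. The standalone snippet below re-implements both formulas with made-up statistics:

import numpy as np
import torch

mean = torch.from_numpy(np.array([0.5, -1.0])).float()
std = torch.from_numpy(np.array([2.0, 0.5])).float()
clip_range = 10.0

v = torch.tensor([[1.5, -2.0], [0.0, 3.0]])
normalized = torch.clamp((v - mean) / std, -clip_range, clip_range)  # Example 14
recovered = mean + normalized * std  # Example 1
assert torch.allclose(recovered, v)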
Example 15
 def get_batch_smooth(self, train=True):
     dataset = self.train_dataset if train else self.test_dataset
     ind = np.random.randint(0, len(dataset), self.batch_size)
     samples = dataset[ind, :]
     samples = normalize_image(samples)
     if self.normalize:
         samples = ((samples - self.train_data_mean) + 1) / 2
     x_next = samples[:, :self.x_next_index]
     x = samples[:, self.x_next_index:]
     return ptu.np_to_var(x_next), ptu.np_to_var(x)
Example 16
 def _action_cost(self, x, current_ob, goal):
     x = ptu.np_to_var(x, requires_grad=True)
     actions = x.unsqueeze(0)
     current_obs = ptu.np_to_var(current_ob[None])
     goals = ptu.np_to_var(goal[None])
     num_steps_left = ptu.np_to_var(np.zeros((1, 1)))
     prob_reach = self.beta_q(current_obs, actions, goals, num_steps_left)
     loss = -prob_reach
     # Return the cost value consistent with the gradient of `loss` below.
     loss_np = ptu.get_numpy(loss)[0].astype(np.float64)
     loss.backward()
     gradient_np = ptu.get_numpy(x.grad).astype(np.float64)
     return loss_np, gradient_np
Example 17
 def choose_action_to_reach_adam(self, current_ob, goal):
     n_parts = 100
     x0 = np.vstack(
         [self.env.action_space.sample() for _ in range(n_parts)])
     current_obs = ptu.np_to_var(current_ob).unsqueeze(0).repeat(n_parts, 1)
     goals = ptu.np_to_var(goal).unsqueeze(0).repeat(n_parts, 1)
     num_steps_left = ptu.np_to_var(np.zeros((n_parts, 1)))
     best_action, _ = fmin_adam_torch(
         self._action_cost_batch,
         x0,
         f_args=(current_obs, goals, num_steps_left),
     )
     return best_action
Example 18
 def cost_function(self, x, current_ob):
     self.forward -= time.time()
     x = ptu.np_to_var(x, requires_grad=True)
     current_ob = ptu.np_to_var(current_ob)
     loss = (self.lagrange_multipler *
             self._feasibility_cost_function(x, current_ob) +
             self._env_cost_function(x, current_ob))
     loss_np = ptu.get_numpy(loss)[0].astype(np.float64)
     self.forward += time.time()
     self.backward -= time.time()
     loss.squeeze(0).backward()
     gradient_np = ptu.get_numpy(x.grad).astype(np.float64)
     self.backward += time.time()
     return loss_np, gradient_np
Example 19
 def get_loss(training=False):
     buffer = replay_buffer.get_replay_buffer(training)
     batch = buffer.random_batch(batch_size)
     obs = ptu.np_to_var(batch['observations'], requires_grad=False)
     goals = ptu.np_to_var(batch['goal_states'], requires_grad=False)
     goal = goal_chooser(obs, goals)
     actions = argmax_q(obs, goal, discount)
     final_state_predicted = goal_conditioned_model(
         obs,
         actions,
         goal,
         discount,
     ) + obs
     rewards = goal_chooser.reward_function(final_state_predicted, goals)
     return -rewards.mean()
Example 20
 def _cost_function(self, x, order):
     x = ptu.np_to_var(x, requires_grad=True)
     loss = 0
     for action, next_state in self.split(x):
         next_features_predicted = next_state[self.goal_slice]
         desired_features = ptu.np_to_var(
             self.env.multitask_goal[self.multitask_goal_slice] *
             np.ones(next_features_predicted.shape))
         diff = next_features_predicted - desired_features
         loss += (diff**2).sum()
     if order == 0:
         return ptu.get_numpy(loss)[0]
     elif order == 1:
         loss.squeeze(0).backward()
         return ptu.get_numpy(x.grad)
Example 21
 def get_action(self, obs):
     obs_pytorch = self.expand_np_to_var(obs)
     sampled_goal_state = ptu.np_to_var(
         self.env.sample_dimensions_irrelevant_to_oc(
             self._goal_np, obs, self.sample_size
         )
     )
     actions = self.argmax_q(
         obs_pytorch,
         sampled_goal_state,
         self._tau_batch,
     )
     # actions = self.env.sample_actions(self.sample_size)
     # actions = ptu.np_to_var(actions)
     # Implicit models only predict future goals
     final_goal_predicted = self.implicit_model(
         obs_pytorch,
         actions,
         sampled_goal_state,
         self._tau_batch,
         only_return_next_state=True,
     )
     rewards = self.rewards_np(
         obs_pytorch,
         final_goal_predicted
     )
     max_i = np.argmax(rewards)
     return ptu.get_numpy(actions[max_i]), {}
Example 22
 def replan(self, current_ob, goal):
     if self.last_solution is None or not self.warm_start:
         solution = []
         for i in range(self.planning_horizon):
             solution.append(current_ob)
         self.last_solution = np.hstack(solution)
     self.desired_features_torch = ptu.np_to_var(goal[None].repeat(
         self.planning_horizon, 0))
     self.forward = self.backward = 0
     start = time.time()
     x, f, d = optimize.fmin_l_bfgs_b(self.cost_function,
                                      self.last_solution,
                                      args=(current_ob, ),
                                      bounds=self.bounds,
                                      **self.solver_kwargs)
     total = time.time() - start
     self.totals.append(total)
     # print("total forward: {}".format(self.forward))
     # print("total backward: {}".format(self.backward))
     # print("total: {}".format(total))
     # print("extra: {}".format(total - self.forward - self.backward))
     # print("total mean: {}".format(np.mean(self.totals)))
     warnflag = d['warnflag']
     if warnflag != 0:
         if warnflag == 1:
             print("too many function evaluations or too many iterations")
         else:
             print(d['task'])
     return x
Example 23
 def get_action(self, obs):
     action_inits = self.sample_actions()
     actions = ptu.np_to_var(action_inits, requires_grad=True)
     obs = self.expand_np_to_var(obs)
     optimizer = optim.Adam([actions], self.learning_rate)
     losses = -self.qf(
         obs,
         actions,
         self._goal_batch,
         self._tau_batch,
     )
     # Gradient descent on the negated Q-values, i.e. gradient ascent on Q
     # with respect to the sampled actions.
     for _ in range(self.num_gradient_steps):
         loss = losses.mean()
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
         losses = -self.qf(
             obs,
             actions,
             self._goal_batch,
             self._tau_batch,
         )
     losses_np = ptu.get_numpy(losses)
     best_action_i = np.argmin(losses_np)
     return ptu.get_numpy(actions[best_action_i, :]), {}
Example 24
def simulate_policy(args):
    ptu.set_gpu_mode(True)
    model = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    model.to(ptu.device)
    imgs = np.load(args.imgfile)
    # Debugger breakpoint left in for interactively inspecting the loaded
    # images; remove it to run this script non-interactively.
    import ipdb
    ipdb.set_trace()
    z = model.encode(ptu.np_to_var(imgs))
    samples = model.decode(z).cpu()

    recon_imgs = samples.data.view(64, model.input_channels, model.imsize,
                                   model.imsize)
    recon_imgs = recon_imgs.cpu()
    grid = make_grid(recon_imgs, nrow=8)
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
    im = Image.fromarray(ndarr)
    im.show()
    # cv2.imshow('img', im)
    # cv2.waitKey(1)
    # for sample in samples:
    #     tensor = tensor.cpu()
    #     img = ptu.get_numpy(tensor)
    # `imgs` is still a NumPy array here; convert and reshape it to match
    # `recon_imgs` before concatenating (`epoch` and `n` below are assumed to
    # be defined by the surrounding script).
    comparison = torch.cat([
        recon_imgs,
        torch.from_numpy(imgs).float().view_as(recon_imgs),
    ])
    save_dir = osp.join(logger.get_snapshot_dir(), 'r%d.png' % epoch)
    save_image(comparison.data.cpu(), save_dir, nrow=n)
Example 25
 def expand_np_to_var(self, array):
     array_expanded = np.repeat(
         np.expand_dims(array, 0),
         self.sample_size,
         axis=0
     )
     return ptu.np_to_var(array_expanded, requires_grad=False)
Example 26
def fmin_adam_torch(
    batch_torch_f,
    x0_np,
    f_args=None,
    f_kwargs=None,
    lr=1e-3,
    num_steps=100,
):
    if f_args is None:
        f_args = tuple()
    if f_kwargs is None:
        f_kwargs = {}

    x = ptu.np_to_var(x0_np, requires_grad=True)
    optimizer = Adam([x], lr=lr)
    for _ in range(num_steps):
        loss = batch_torch_f(x, *f_args, **f_kwargs).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    final_values_np = ptu.get_numpy(batch_torch_f(x, *f_args, **f_kwargs))
    final_x_np = ptu.get_numpy(x)
    min_i = np.argmin(final_values_np)
    return final_x_np[min_i], final_values_np[min_i]
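A simple way to exercise fmin_adam_torch from this example is to minimize a batched quadratic: each row of x0 is an independent starting point, and the best row after optimization is returned. The cost function and numbers below are made up for illustration and assume ptu.np_to_var yields a float tensor with gradients enabled.

import numpy as np
import torch


def batch_quadratic(x, target):
    # One scalar cost per row of x.
    return ((x - target) ** 2).sum(dim=1)


target = torch.tensor([1.0, -2.0])
x0 = np.random.randn(32, 2)
best_x, best_value = fmin_adam_torch(
    batch_quadratic,
    x0,
    f_args=(target,),
    lr=1e-1,
    num_steps=500,
)
# best_x should end up close to [1.0, -2.0] and best_value close to 0.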
Example 27
 def denormalize_scale(self, v):
     """
     Only denormalize the scale. Do not add the mean.
     """
     std = ptu.np_to_var(self.std, requires_grad=False)
     if v.dim() == 2:
         std = std.unsqueeze(0)
     return v * std
Example 28
    def cost_function(self, x, current_ob, verbose=False):
        self.forward -= time.time()
        x = ptu.np_to_var(x, requires_grad=True)
        current_ob = ptu.np_to_var(current_ob)
        env_costs = self._env_cost_function(x, current_ob)
        probabilities = self._feasibility_probabilities(x, current_ob)
        if self.use_max_cost:
            not_reached_cost = self.max_cost
        else:
            not_reached_cost = ((current_ob[self.goal_slice] -
                                 self.desired_features_torch)**2).sum()
        if verbose:
            print("---")
            print("env_costs", env_costs)
            print("not reached cost", not_reached_cost)
            print("probabilities", probabilities)
        if self.only_use_terminal_env_loss:
            final_prob = torch.prod(probabilities)
            loss = (env_costs * (final_prob + 1)
                    + (1 - final_prob) * not_reached_cost)
            # if verbose:
            #     print("final prob", final_prob)
        else:
            """
            argmin_s c(s) p(s) + C_max (1-p(s))
             = argmin_s (c(s) - C_max) p(s)
             = argmin_s -log(C_max - c(s)) - log p(s)

            However, doing the cum-probs thing is better
            (i.e. it's a tighter lower bound)
            """
            # loss = -torch.log(
            #     self.planning_horizon * not_reached_cost - env_costs
            # ).sum() - traj_log_prob
            cum_probs = self._compute_cum_prob(probabilities)
            loss = env_costs * cum_probs + (1 - cum_probs) * not_reached_cost
            # if verbose:
            #     print("cum_probs", cum_probs)
            loss = loss.sum()
        loss_np = ptu.get_numpy(loss)[0].astype(np.float64)
        self.forward += time.time()
        self.backward -= time.time()
        loss.backward()
        gradient_np = ptu.get_numpy(x.grad).astype(np.float64)
        self.backward += time.time()
        return loss_np, gradient_np
Example 29
 def reward(self, state, action, next_state):
     rewards_np = self.env.compute_rewards(
         None,
         None,
         ptu.get_numpy(next_state),
         ptu.get_numpy(self._goal_batch),
     )
     return ptu.np_to_var(rewards_np)
Example 30
 def get_batch(self, train=True):
     dataset = self.train_dataset if train else self.test_dataset
     ind = np.random.randint(0, len(dataset), self.batch_size)
     samples = dataset[ind, :]
     samples = normalize_image(samples)
     if self.normalize:
         samples = ((samples - self.train_data_mean) + 1) / 2
     return ptu.np_to_var(samples)