def denormalize(self, v):
    mean = ptu.np_to_var(self.mean, requires_grad=False)
    std = ptu.np_to_var(self.std, requires_grad=False)
    if v.dim() == 2:
        mean = mean.unsqueeze(0)
        std = std.unsqueeze(0)
    return mean + v * std
def get_action(self, current_ob, goal, num_steps_left):
    # Replan if we are configured to replan every step, the cached plan has
    # been fully executed, or no plan exists yet.
    if (self.replan_every_time_step
            or self.t_in_plan == self.planning_horizon
            or self.last_solution is None):
        if self.dynamic_lm and self.best_obs_seq is not None:
            error = np.linalg.norm(
                current_ob - self.best_obs_seq[self.t_in_plan + 1]
            )
            self.update_lagrange_multiplier(error)
        full_solution = self.replan(current_ob, goal)

        x_torch = ptu.np_to_var(full_solution, requires_grad=True)
        current_ob_torch = ptu.np_to_var(current_ob)
        obs, next_obs = self.batchify(x_torch, current_ob_torch)
        actions = self.tdm_policy(
            observations=obs,
            goals=next_obs,
            num_steps_left=self.num_steps_left_pytorch,
        )
        self.best_action_seq = ptu.get_numpy(actions)
        self.best_obs_seq = np.array(
            [current_ob] + [ptu.get_numpy(o) for o in next_obs]
        )

        self.last_solution = full_solution
        self.t_in_plan = 0

    # Execute the next step of the (possibly new) plan.
    agent_info = dict(
        best_action_seq=self.best_action_seq[self.t_in_plan:],
        best_obs_seq=self.best_obs_seq[self.t_in_plan:],
    )
    action = self.best_action_seq[self.t_in_plan]
    self.t_in_plan += 1

    return action, agent_info
def get_action(self, obs):
    state = self.expand_np_to_var(obs)
    first_sampled_actions = self.sample_actions()
    action = ptu.np_to_var(first_sampled_actions)
    next_state = ptu.np_to_var(self.env.sample_states(self.sample_size))

    penalties = []
    for i in range(self.horizon):
        constraint_penalty = self.qf(
            state,
            action,
            self.env.convert_obs_to_goal_states_pytorch(next_state),
            self._tau_batch,
        )**2
        penalties.append(
            - self.constraint_weight * constraint_penalty
        )

        action = ptu.np_to_var(
            self.env.sample_actions(self.sample_size)
        )
        state = next_state
        next_state = ptu.np_to_var(
            self.env.sample_states(self.sample_size)
        )

    reward = self.reward(state, action, next_state)
    final_score = reward + sum(penalties)
    max_i = np.argmax(ptu.get_numpy(final_score))
    return first_sampled_actions[max_i], {}
def _get_action(self, current_ob, goal):
    if (self.replan_every_time_step
            or self.t_in_plan == self.planning_horizon
            or self.last_solution is None):
        full_solution = self.replan(current_ob, goal)

        x_torch = ptu.np_to_var(full_solution, requires_grad=True)
        current_ob_torch = ptu.np_to_var(current_ob)
        _, next_obs = self.batchify(x_torch, current_ob_torch)
        self.subgoal_seq = np.array(
            [current_ob] + [ptu.get_numpy(o) for o in next_obs]
        )
        self.planned_action_seq = self.goal_reaching_policy.eval_np(
            self.subgoal_seq[:-1],
            self.subgoal_seq[1:],
            np.zeros((self.planning_horizon, 1)),
        )

        self.last_solution = full_solution
        self.t_in_plan = 0

    action = self.planned_action_seq[self.t_in_plan]
    new_goal = self.subgoal_seq[self.t_in_plan + 1]
    self.current_goal = new_goal
    oracle_qmax_action = self.get_oracle_qmax_action(current_ob, new_goal)
    if self.use_oracle_argmax_policy:
        action = oracle_qmax_action
    agent_info = dict(
        planned_action_seq=self.planned_action_seq[self.t_in_plan:],
        subgoal_seq=self.subgoal_seq[self.t_in_plan:],
        oracle_qmax_action=oracle_qmax_action,
        full_action_seq=self.planned_action_seq,
        full_obs_seq=self.subgoal_seq,
    )
    self.t_in_plan += 1

    return action, agent_info
def next_state(self, state, action, goal_state, discount):
    state = ptu.np_to_var(np.expand_dims(state, 0))
    action = ptu.np_to_var(np.expand_dims(action, 0))
    goal_state = ptu.np_to_var(np.expand_dims(goal_state, 0))
    discount = ptu.np_to_var(np.array([[discount]]))
    return ptu.get_numpy(
        self.qf(state, action, goal_state, discount) + state
    )[0]
def get_debug_batch(self, train=True):
    dataset = self.train_dataset if train else self.test_dataset
    X, Y = dataset
    ind = np.random.randint(0, Y.shape[0], self.batch_size)
    X = X[ind, :]
    Y = Y[ind, :]
    return ptu.np_to_var(X), ptu.np_to_var(Y)
def train_encoder(encoder, decoder, encoder_opt):
    batch, true_latents = swirl_data(BS)
    batch = ptu.np_to_var(batch)

    latents, means, log_stds, stds = encoder.get_encoding_and_suff_stats(
        batch
    )
    kl = kl_to_prior(means, log_stds, stds)

    latents = encoder.encode(batch)
    decoder_output = decoder(latents)
    decoder_means = decoder_output[:, 0:2]
    decoder_log_stds = decoder_output[:, 2:4]
    distribution = Normal(decoder_means, decoder_log_stds.exp())
    reconstruction_log_prob = distribution.log_prob(batch).sum(dim=1)

    # elbo = - kl + reconstruction_log_prob
    # loss = - elbo.mean()
    loss = - reconstruction_log_prob.mean()
    # This is the second place where we cheat:
    latent_loss = ((ptu.np_to_var(true_latents) - latents) ** 2).mean()
    loss = loss  # + latent_loss
    encoder_opt.zero_grad()
    loss.backward()
    encoder_opt.step()
    return loss
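# `kl_to_prior` is called above but not defined in this section. Below is a
# minimal sketch, assuming the encoder outputs a diagonal Gaussian N(mean, std^2)
# and the prior is a unit Gaussian N(0, I); the signature matches the call site,
# and the closed-form KL is summed over latent dimensions, one value per sample.
def kl_to_prior(means, log_stds, stds):
    # KL(N(mu, sigma^2) || N(0, 1)) = -log(sigma) + (sigma^2 + mu^2 - 1) / 2 per dim
    return (-log_stds + (stds ** 2 + means ** 2 - 1) / 2).sum(dim=1)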
def pretrain_encoder(encoder, opt):
    losses = []
    for _ in range(1000):
        x_np, y_np = swirl_data(BS)
        x = ptu.np_to_var(x_np)
        y = ptu.np_to_var(y_np)
        y_hat = encoder.encode(x)
        loss = ((y_hat - y) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

        losses.append(loss.data.numpy())

    if VERBOSE:
        x_np, y_np = swirl_data(N_VIS)
        x = ptu.np_to_var(x_np)
        y_hat = encoder.encode(x)
        y_hat_np = y_hat.data.numpy()
        x_hat_np = t_to_xy(y_hat_np[:, 0])

        plt.subplot(2, 1, 1)
        plt.plot(np.array(losses))
        plt.title("Training Loss")
        plt.subplot(2, 1, 2)
        plt.plot(x_np[:, 0], x_np[:, 1], '.')
        plt.plot(x_hat_np[:, 0], x_hat_np[:, 1], '.')
        plt.title("Samples")
        plt.legend(["Samples", "Estimates"])
        plt.show()
def get_action(self, obs):
    obs, goals, taus = split_flat_obs(
        obs[None],
        self.env.observation_space.low.size,
        self.env.goal_dim,
    )
    sampled_actions = self.sample_actions()
    first_sampled_actions = sampled_actions.copy()
    actions = ptu.np_to_var(sampled_actions)
    next_obs = self.expand_np_to_var(obs[0])
    goals = self.expand_np_to_var(goals[0])
    taus = self.expand_np_to_var(taus[0])
    costs = 0
    for i in range(self.mpc_horizon):
        curr_obs = next_obs
        if i > 0:
            sampled_actions = self.sample_actions()
            actions = ptu.np_to_var(sampled_actions)
        flat_obs = merge_into_flat_obs(
            curr_obs,
            goals,
            taus,
        )
        obs_delta = self.debug_qf(
            flat_obs, actions, return_internal_prediction=True
        )
        next_obs = curr_obs + obs_delta
        next_features = self.env.convert_obs_to_goals(next_obs)
        costs += (next_features[:, :7] - goals[:, :7])**2
    costs_np = ptu.get_numpy(costs).sum(1)
    min_i = np.argmin(costs_np)
    return first_sampled_actions[min_i], {}
def get_action(self, ob):
    if self.last_solution is None or not self.warm_start:
        init_solution = []
        for _ in range(self.planning_horizon):
            init_solution.append(
                np.repeat(ob[None], self.num_particles, axis=0)
            )
        self.last_solution = np.hstack(init_solution)
    ob = self._expand_np_to_var(ob)
    actions_np = np.hstack(
        [self.sample_actions() for _ in range(self.planning_horizon)]
    )
    actions = ptu.np_to_var(actions_np)
    next_states = ptu.np_to_var(self.last_solution, requires_grad=True)
    optimizer = optim.Adam([next_states], lr=self.learning_rate)

    for i in range(self.num_grad_steps):
        constraint_loss = self.constraint_fctn(ob, actions, next_states)
        optimizer.zero_grad()
        constraint_loss.sum().backward()
        optimizer.step()

    final_loss = (
        self.cost_function(ob, actions, next_states)
        + self.lagrange_multiplier * self.constraint_fctn(
            ob, actions, next_states
        )
    )
    self.last_solution = ptu.get_numpy(next_states)
    final_loss_np = ptu.get_numpy(final_loss).sum(axis=1)
    min_i = np.argmin(final_loss_np)
    action = actions_np[min_i, :self.action_dim]
    return action, {}
def evaluate(x, y):
    action = np.array([x, y])
    action = ptu.np_to_var(action).unsqueeze(0)
    state = ptu.np_to_var(start_state).unsqueeze(0)
    goal_states = ptu.np_to_var(goal_state).unsqueeze(0)
    discount = ptu.np_to_var(np.array([[0]]))
    out = qf(state, action, goal_states, discount)
    return out.data.numpy()
def get_np_action(self, state_np, goal_state_np):
    return ptu.get_numpy(
        self.policy(
            ptu.np_to_var(np.expand_dims(state_np, 0)),
            ptu.np_to_var(np.expand_dims(goal_state_np, 0)),
            self._tau_expanded_torch,
        ).squeeze(0)
    )
def get_action(self, obs):
    sampled_actions = self.sample_actions()
    first_sampled_actions = sampled_actions.copy()
    all_actions_np = [first_sampled_actions]
    actions = ptu.np_to_var(sampled_actions)
    next_obs = self.expand_np_to_var(obs)
    all_obs_torch = [next_obs]
    costs = 0
    all_costs = []
    for i in range(self.mpc_horizon):
        curr_obs = next_obs
        if i > 0:
            sampled_actions = self.sample_actions()
            all_actions_np.append(sampled_actions)
            actions = ptu.np_to_var(sampled_actions)
        next_obs = curr_obs + self.dynamics_model(curr_obs, actions)
        all_obs_torch.append(next_obs)
        new_costs = self.cost_fn(
            ptu.get_numpy(curr_obs),
            ptu.get_numpy(actions),
            ptu.get_numpy(next_obs),
        )
        costs = costs + new_costs
        all_costs.append(new_costs)

    # Reward sum of costs or just last time step?
    # min_i = np.argmin(costs)
    min_costs = np.array(all_costs).min(0)
    min_i = np.argmin(min_costs)

    # For Point2d u-shaped wall
    # best_action_seq = [action_t[min_i, :] for action_t in all_actions_np]
    # best_obs_seq = [
    #     ptu.get_numpy(ob_t[min_i, :]) for ob_t in all_obs_torch
    # ]
    #
    # real_obs_seq = self.env.wrapped_env.wrapped_env.true_states(
    #     obs, best_action_seq
    # )
    # self.ax1.clear()
    # self.env.wrapped_env.wrapped_env.plot_trajectory(
    #     self.ax1,
    #     np.array(best_obs_seq),
    #     np.array(best_action_seq),
    #     goal=self.env.wrapped_env.wrapped_env._target_position,
    # )
    # self.ax1.set_title("imagined")
    # self.ax2.clear()
    # self.env.wrapped_env.wrapped_env.plot_trajectory(
    #     self.ax2,
    #     np.array(real_obs_seq),
    #     np.array(best_action_seq),
    #     goal=self.env.wrapped_env.wrapped_env._target_position,
    # )
    # self.ax2.set_title("real")
    # plt.draw()
    # plt.pause(0.001)
    return first_sampled_actions[min_i], {}
def normalize(self, v, clip_range=None):
    if clip_range is None:
        clip_range = self.default_clip_range
    mean = ptu.np_to_var(self.mean, requires_grad=False)
    std = ptu.np_to_var(self.std, requires_grad=False)
    if v.dim() == 2:
        # Unsqueeze along the batch dimension and use automatic broadcasting
        mean = mean.unsqueeze(0)
        std = std.unsqueeze(0)
    return torch.clamp((v - mean) / std, -clip_range, clip_range)
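# A minimal usage sketch of the normalizer methods above (`normalize` and
# `denormalize`). The names `normalizer`, `obs_np`, and `model` are hypothetical
# stand-ins: `normalizer` is an instance of the class these methods belong to,
# with `mean`/`std` fit on observation data, and `model` is any network that
# operates in normalized space.
obs_torch = ptu.np_to_var(obs_np)             # raw observations, shape (batch, obs_dim)
obs_scaled = normalizer.normalize(obs_torch)  # (v - mean) / std, clipped to +/- clip_range
pred_scaled = model(obs_scaled)               # prediction in normalized units
pred = normalizer.denormalize(pred_scaled)    # mean + pred_scaled * std, back to raw units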
def get_batch_smooth(self, train=True):
    dataset = self.train_dataset if train else self.test_dataset
    ind = np.random.randint(0, len(dataset), self.batch_size)
    samples = dataset[ind, :]
    samples = normalize_image(samples)
    if self.normalize:
        samples = ((samples - self.train_data_mean) + 1) / 2
    x_next, x = samples[:, :self.x_next_index], samples[:, self.x_next_index:]
    return ptu.np_to_var(x_next), ptu.np_to_var(x)
def _action_cost(self, x, current_ob, goal):
    x = ptu.np_to_var(x, requires_grad=True)
    actions = x.unsqueeze(0)
    current_obs = ptu.np_to_var(current_ob[None])
    goals = ptu.np_to_var(goal[None])
    num_steps_left = ptu.np_to_var(np.zeros((1, 1)))
    prob_reach = self.beta_q(current_obs, actions, goals, num_steps_left)
    loss = -prob_reach
    loss_np = ptu.get_numpy(loss)[0].astype(np.float64)
    loss.backward()
    gradient_np = ptu.get_numpy(x.grad).astype(np.float64)
    return loss_np, gradient_np
def choose_action_to_reach_adam(self, current_ob, goal):
    n_parts = 100
    x0 = np.vstack(
        [self.env.action_space.sample() for _ in range(n_parts)]
    )
    current_obs = ptu.np_to_var(current_ob).unsqueeze(0).repeat(n_parts, 1)
    goals = ptu.np_to_var(goal).unsqueeze(0).repeat(n_parts, 1)
    num_steps_left = ptu.np_to_var(np.zeros((n_parts, 1)))
    best_action, _ = fmin_adam_torch(
        self._action_cost_batch,
        x0,
        f_args=(current_obs, goals, num_steps_left),
    )
    return best_action
def cost_function(self, x, current_ob):
    self.forward -= time.time()
    x = ptu.np_to_var(x, requires_grad=True)
    current_ob = ptu.np_to_var(current_ob)
    loss = (
        self.lagrange_multipler * self._feasibility_cost_function(x, current_ob)
        + self._env_cost_function(x, current_ob)
    )
    loss_np = ptu.get_numpy(loss)[0].astype(np.float64)
    self.forward += time.time()

    self.backward -= time.time()
    loss.squeeze(0).backward()
    gradient_np = ptu.get_numpy(x.grad).astype(np.float64)
    self.backward += time.time()
    return loss_np, gradient_np
def get_loss(training=False):
    buffer = replay_buffer.get_replay_buffer(training)
    batch = buffer.random_batch(batch_size)
    obs = ptu.np_to_var(batch['observations'], requires_grad=False)
    goals = ptu.np_to_var(batch['goal_states'], requires_grad=False)
    goal = goal_chooser(obs, goals)
    actions = argmax_q(obs, goal, discount)
    final_state_predicted = goal_conditioned_model(
        obs,
        actions,
        goal,
        discount,
    ) + obs
    rewards = goal_chooser.reward_function(final_state_predicted, goals)
    return -rewards.mean()
def _cost_function(self, x, order):
    x = ptu.np_to_var(x, requires_grad=True)
    loss = 0
    for action, next_state in self.split(x):
        next_features_predicted = next_state[self.goal_slice]
        desired_features = ptu.np_to_var(
            self.env.multitask_goal[self.multitask_goal_slice]
            * np.ones(next_features_predicted.shape)
        )
        diff = next_features_predicted - desired_features
        loss += (diff**2).sum()
    if order == 0:
        return ptu.get_numpy(loss)[0]
    elif order == 1:
        loss.squeeze(0).backward()
        return ptu.get_numpy(x.grad)
def get_action(self, obs):
    obs_pytorch = self.expand_np_to_var(obs)
    sampled_goal_state = ptu.np_to_var(
        self.env.sample_dimensions_irrelevant_to_oc(
            self._goal_np,
            obs,
            self.sample_size,
        )
    )
    actions = self.argmax_q(
        obs_pytorch,
        sampled_goal_state,
        self._tau_batch,
    )
    # actions = self.env.sample_actions(self.sample_size)
    # actions = ptu.np_to_var(actions)

    # Implicit models only predict future goals
    final_goal_predicted = self.implicit_model(
        obs_pytorch,
        actions,
        sampled_goal_state,
        self._tau_batch,
        only_return_next_state=True,
    )
    rewards = self.rewards_np(
        obs_pytorch,
        final_goal_predicted,
    )
    max_i = np.argmax(rewards)
    return ptu.get_numpy(actions[max_i]), {}
def replan(self, current_ob, goal):
    if self.last_solution is None or not self.warm_start:
        solution = []
        for i in range(self.planning_horizon):
            solution.append(current_ob)
        self.last_solution = np.hstack(solution)
    self.desired_features_torch = ptu.np_to_var(
        goal[None].repeat(self.planning_horizon, 0)
    )
    self.forward = self.backward = 0
    start = time.time()
    x, f, d = optimize.fmin_l_bfgs_b(
        self.cost_function,
        self.last_solution,
        args=(current_ob,),
        bounds=self.bounds,
        **self.solver_kwargs
    )
    total = time.time() - start
    self.totals.append(total)
    # print("total forward: {}".format(self.forward))
    # print("total backward: {}".format(self.backward))
    # print("total: {}".format(total))
    # print("extra: {}".format(total - self.forward - self.backward))
    # print("total mean: {}".format(np.mean(self.totals)))
    warnflag = d['warnflag']
    if warnflag != 0:
        if warnflag == 1:
            print("too many function evaluations or too many iterations")
        else:
            print(d['task'])
    return x
def get_action(self, obs):
    action_inits = self.sample_actions()
    actions = ptu.np_to_var(action_inits, requires_grad=True)
    obs = self.expand_np_to_var(obs)
    optimizer = optim.Adam([actions], self.learning_rate)

    losses = -self.qf(
        obs,
        actions,
        self._goal_batch,
        self._tau_batch,
    )
    for _ in range(self.num_gradient_steps):
        loss = losses.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses = -self.qf(
            obs,
            actions,
            self._goal_batch,
            self._tau_batch,
        )
    losses_np = ptu.get_numpy(losses)
    best_action_i = np.argmin(losses_np)
    return ptu.get_numpy(actions[best_action_i, :]), {}
def simulate_policy(args):
    ptu.set_gpu_mode(True)
    model = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    model.to(ptu.device)
    imgs = np.load(args.imgfile)
    import ipdb
    ipdb.set_trace()
    z = model.encode(ptu.np_to_var(imgs))
    samples = model.decode(z).cpu()
    recon_imgs = samples.data.view(
        64, model.input_channels, model.imsize, model.imsize
    )
    recon_imgs = recon_imgs.cpu()
    grid = make_grid(recon_imgs, nrow=8)
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
    im = Image.fromarray(ndarr)
    im.show()
    # cv2.imshow('img', im)
    # cv2.waitKey(1)
    # for sample in samples:
    #     tensor = tensor.cpu()
    #     img = ptu.get_numpy(tensor)
    comparison = torch.cat([
        recon_imgs,
        imgs,
    ])
    save_dir = osp.join(logger.get_snapshot_dir(), 'r%d.png' % epoch)
    save_image(comparison.data.cpu(), save_dir, nrow=n)
def expand_np_to_var(self, array):
    array_expanded = np.repeat(
        np.expand_dims(array, 0),
        self.sample_size,
        axis=0,
    )
    return ptu.np_to_var(array_expanded, requires_grad=False)
def fmin_adam_torch(
        batch_torch_f,
        x0_np,
        f_args=None,
        f_kwargs=None,
        lr=1e-3,
        num_steps=100,
):
    if f_args is None:
        f_args = tuple()
    if f_kwargs is None:
        f_kwargs = {}
    x = ptu.np_to_var(x0_np, requires_grad=True)
    optimizer = Adam([x], lr=lr)
    for _ in range(num_steps):
        loss = batch_torch_f(x, *f_args, **f_kwargs).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    final_values_np = ptu.get_numpy(batch_torch_f(x, *f_args, **f_kwargs))
    final_x_np = ptu.get_numpy(x)
    min_i = np.argmin(final_values_np)
    return final_x_np[min_i], final_values_np[min_i]
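# A minimal usage sketch of `fmin_adam_torch` above: run Adam on a batch of
# random starting points in parallel and keep the best one. `batch_quadratic`
# and the target point `c` are made up purely for illustration; any function
# mapping a (num_particles, dim) tensor to one cost per particle would work.
def batch_quadratic(x):
    c = ptu.np_to_var(np.array([1.0, -2.0]))  # fixed target, broadcast over particles
    return ((x - c) ** 2).sum(dim=1)          # one scalar cost per particle

x0 = np.random.randn(128, 2)  # 128 random 2-D starting points
best_x, best_value = fmin_adam_torch(batch_quadratic, x0, lr=1e-2, num_steps=500)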
def denormalize_scale(self, v):
    """
    Only denormalize the scale. Do not add the mean.
    """
    std = ptu.np_to_var(self.std, requires_grad=False)
    if v.dim() == 2:
        std = std.unsqueeze(0)
    return v * std
def cost_function(self, x, current_ob, verbose=False):
    self.forward -= time.time()
    x = ptu.np_to_var(x, requires_grad=True)
    current_ob = ptu.np_to_var(current_ob)
    env_costs = self._env_cost_function(x, current_ob)
    probabilities = self._feasibility_probabilities(x, current_ob)
    if self.use_max_cost:
        not_reached_cost = self.max_cost
    else:
        not_reached_cost = (
            (current_ob[self.goal_slice] - self.desired_features_torch)**2
        ).sum()
    if verbose:
        print("---")
        print("env_costs", env_costs)
        print("not reached cost", not_reached_cost)
        print("probabilities", probabilities)
    if self.only_use_terminal_env_loss:
        final_prob = torch.prod(probabilities)
        loss = (
            env_costs * (final_prob + 1)
            + (1 - final_prob) * not_reached_cost
        )
        # if verbose:
        #     print("final prob", final_prob)
    else:
        """
        argmin_s c(s) p(s) + C_max (1-p(s))
        = argmin_s (c(s) - C_max) p(s)
        = argmin_s -log(C_max - c(s)) - log p(s)

        However, doing the cum-probs thing is better (i.e. it's a tighter
        lower bound)
        """
        # loss = -torch.log(
        #     self.planning_horizon * not_reached_cost - env_costs
        # ).sum() - traj_log_prob
        cum_probs = self._compute_cum_prob(probabilities)
        loss = env_costs * cum_probs + (1 - cum_probs) * not_reached_cost
        # if verbose:
        #     print("cum_probs", cum_probs)
    loss = loss.sum()
    loss_np = ptu.get_numpy(loss)[0].astype(np.float64)
    self.forward += time.time()

    self.backward -= time.time()
    loss.backward()
    gradient_np = ptu.get_numpy(x.grad).astype(np.float64)
    self.backward += time.time()
    return loss_np, gradient_np
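# `_compute_cum_prob` is referenced above but not shown in this section. A
# plausible sketch, assuming `probabilities` holds one feasibility probability
# per planning step and the intent is the probability that every subgoal up to
# step t has been reached (the "cum-probs" weighting mentioned in the comment):
def _compute_cum_prob(self, probabilities):
    # cumulative product along the horizon: p_1, p_1*p_2, ..., p_1*...*p_T
    return torch.cumprod(probabilities, dim=0)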
def reward(self, state, action, next_state):
    rewards_np = self.env.compute_rewards(
        None,
        None,
        ptu.get_numpy(next_state),
        ptu.get_numpy(self._goal_batch),
    )
    return ptu.np_to_var(rewards_np)
def get_batch(self, train=True):
    dataset = self.train_dataset if train else self.test_dataset
    ind = np.random.randint(0, len(dataset), self.batch_size)
    samples = dataset[ind, :]
    samples = normalize_image(samples)
    if self.normalize:
        samples = ((samples - self.train_data_mean) + 1) / 2
    return ptu.np_to_var(samples)