# Imports assumed for the snippets below: the stdlib/PyTorch ones are certain;
# Logger, Normalize, Vectorize, PPOBuffer, GaussianMLP, make_env_fn and
# renderloop are project-local and assumed importable from the surrounding repo.
import time
from copy import deepcopy
from functools import partial

import numpy as np
import torch
import torch.multiprocessing as mp  # policy.share_memory() below implies torch.multiprocessing
import torch.optim as optim
from torch.distributions import kl_divergence
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler


def run_experiment(algo, policy, env_fn, args, normalizer=None, log=True,
                   monitor=False, render=False):
    logger = Logger(args, viz=monitor) if log else None

    # HOTFIX for Patrick's desktop (MP is buggy on it for some reason):
    if render:
        # Put the policy weights in shared memory so the render process
        # always sees the trainer's latest parameters.
        policy.share_memory()

        train_p = mp.Process(target=algo.train,
                             args=(env_fn, policy, args.n_itr, normalizer),
                             kwargs=dict(logger=logger))
        train_p.start()

        # TODO: add normalize as a command-line argument
        renv_fn = partial(env_fn)
        renv = Normalize(Vectorize([renv_fn]))

        render_p = mp.Process(target=renderloop, args=(renv, policy))
        render_p.start()

        train_p.join()
        render_p.join()
    else:
        print("logger: ", logger)
        algo.train(env_fn, policy, args.n_itr, normalizer, logger=logger)
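# renderloop is launched above but not defined in this file. A minimal sketch
# of what it might look like, assuming the policy/env interfaces used in
# _sample below; a reconstruction, not the original function:
def renderloop(env, policy):
    state = torch.Tensor(env.reset())
    while True:
        with torch.no_grad():
            # Shared-memory weights mean this reflects the latest training step.
            _, action = policy.act(state, deterministic=True)
        state, _, done, _ = env.step(action.numpy())
        state = torch.Tensor(state)
        env.render()
        if done:
            state = torch.Tensor(env.reset())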
def _sample(self, env_fn, policy, min_steps, max_traj_len, deterministic=False):
    """
    Sample at least min_steps total timesteps, truncating trajectories
    only if they exceed max_traj_len timesteps.
    """
    env = Vectorize([env_fn])

    memory = PPOBuffer(self.gamma, self.lam)

    num_steps = 0
    while num_steps < min_steps:
        state = torch.Tensor(env.reset())

        done = False
        value = 0
        traj_len = 0

        while not done and traj_len < max_traj_len:
            value, action = policy.act(state, deterministic)

            next_state, reward, done, _ = env.step(action.numpy())

            memory.store(state.numpy(), action.numpy(), reward, value.numpy())

            state = torch.Tensor(next_state)

            traj_len += 1
            num_steps += 1

        # Bootstrap the return with V(s_T) only if the trajectory was
        # truncated; (not done) zeroes it out for genuine terminal states.
        value, _ = policy.act(state)
        memory.finish_path(last_val=(not done) * value.numpy())

    return memory
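# PPOBuffer is used above but not defined in this file. A minimal sketch of
# the interface _sample relies on (store / finish_path / get, plus the
# ep_returns and ep_lens lists read by the training loop). The plain
# discounted-return computation below is an assumption; the original may use
# GAE(lambda), given that it takes a lam parameter:
class PPOBuffer:
    def __init__(self, gamma=0.99, lam=0.95):
        self.gamma, self.lam = gamma, lam
        self.states, self.actions, self.rewards, self.values = [], [], [], []
        self.returns = []
        self.ep_returns = []  # undiscounted return of each finished trajectory
        self.ep_lens = []     # length of each finished trajectory
        self.ptr = 0          # start index of the trajectory in progress

    def store(self, state, action, reward, value):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.values.append(value)

    def finish_path(self, last_val=0.0):
        rewards = self.rewards[self.ptr:]
        R = float(np.asarray(last_val).sum())  # bootstrap value (0 if terminal)
        returns = []
        for r in reversed(rewards):  # discount back-to-front
            R = float(np.asarray(r).sum()) + self.gamma * R
            returns.insert(0, R)
        self.returns += returns
        self.ep_returns.append(float(np.sum(rewards)))
        self.ep_lens.append(len(rewards))
        self.ptr = len(self.rewards)

    def get(self):
        return (np.array(self.states), np.array(self.actions),
                np.array(self.returns), np.array(self.values))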
# TODO: add command line arguments for normalization on/off, and for ensemble policy?
if __name__ == "__main__":
    torch.set_num_threads(1)  # see: https://github.com/pytorch/pytorch/issues/13757

    # `args` comes from an argparse block elided above this snippet.
    if args.new:
        env_fn = make_env_fn(state_est=args.state_est)
        # env_fn = make_cassie_env("walking", clock_based=True)
        # env_fn = functools.partial(CassieEnv_speed, "walking", clock_based=True, state_est=False)
        # env_fn = functools.partial(CassieEnv_nodelta, "walking", clock_based=True, state_est=False)
        # env_fn = functools.partial(CassieEnv_speed_dfreq, "walking", clock_based=True, state_est=args.state_est)

        env = Vectorize([env_fn])

        obs_dim = env_fn().observation_space.shape[0]
        action_dim = env_fn().action_space.shape[0]

        policy = GaussianMLP(obs_dim, action_dim,
                             nonlinearity="relu",
                             init_std=np.exp(-2),
                             learn_std=False)

        # policy2 = ActorCriticNet(obs_dim, action_dim, [256, 256])
        # print(policy, sum(p.numel() for p in policy.parameters()))
        # print(policy2, sum(p.numel() for p in policy2.parameters()))
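        # A possible hand-off from here to run_experiment above. The PPO
        # constructor signature is an assumption for illustration, not the
        # original call:
        #
        #   algo = PPO(args=args)
        #   run_experiment(algo, policy, env_fn, args, normalizer=Normalize)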
def train(self, env_fn, policy, n_itr, normalize=None, logger=None):
    if normalize is not None:
        policy.train()
    else:
        policy.train(0)

    env = Vectorize([env_fn])  # this will be useful for parallelism later

    if normalize is not None:
        # Collect running obs statistics, bake them into the policy, then
        # switch back to an unnormalized env (the policy normalizes itself).
        env = normalize(env)
        mean, std = env.ob_rms.mean, np.sqrt(env.ob_rms.var + 1E-8)
        policy.obs_mean = torch.Tensor(mean)
        policy.obs_std = torch.Tensor(std)
        policy.train(0)
        env = Vectorize([env_fn])

    old_policy = deepcopy(policy)

    optimizer = optim.SGD(policy.parameters(), lr=self.lr)

    start_time = time.time()

    for itr in range(n_itr):
        print("********** Iteration {} ************".format(itr))

        sample_t = time.time()
        if self.n_proc > 1:
            print("sampling with {} parallel processes".format(self.n_proc))
            batch = self.sample_parallel(env_fn, policy, self.num_steps, 300)
        else:
            batch = self._sample(env_fn, policy, self.num_steps, 300)  # TODO: fix this
        print("sample time: {:.2f} s".format(time.time() - sample_t))

        observations, actions, returns, values = map(torch.Tensor, batch.get())

        advantages = returns - values
        advantages = (advantages - advantages.mean()) / (advantages.std() + self.eps)

        minibatch_size = self.minibatch_size or advantages.numel()

        print("timesteps in batch: %i" % advantages.numel())

        old_policy.load_state_dict(policy.state_dict())  # WAY faster than deepcopy

        for _ in range(self.epochs):
            losses = []
            sampler = BatchSampler(SubsetRandomSampler(range(advantages.numel())),
                                   minibatch_size,
                                   drop_last=True)

            for indices in sampler:
                indices = torch.LongTensor(indices)

                obs_batch = observations[indices]
                action_batch = actions[indices]
                return_batch = returns[indices]
                advantage_batch = advantages[indices]

                # Note: this shadows the `values` tensor from batch.get(),
                # which is only needed before this loop.
                values, pdf = policy.evaluate(obs_batch)

                # TODO: move this outside the loop?
                with torch.no_grad():
                    _, old_pdf = old_policy.evaluate(obs_batch)
                    old_log_probs = old_pdf.log_prob(action_batch).sum(-1, keepdim=True)

                log_probs = pdf.log_prob(action_batch).sum(-1, keepdim=True)

                ratio = (log_probs - old_log_probs).exp()

                cpi_loss = ratio * advantage_batch
                clip_loss = ratio.clamp(1.0 - self.clip, 1.0 + self.clip) * advantage_batch
                actor_loss = -torch.min(cpi_loss, clip_loss).mean()

                critic_loss = 0.5 * (return_batch - values).pow(2).mean()

                entropy_penalty = -self.entropy_coeff * pdf.entropy().mean()

                # TODO: add ability to optimize critic and actor separately, with different learning rates
                optimizer.zero_grad()
                (actor_loss + critic_loss + entropy_penalty).backward()
                optimizer.step()

                # Adaptive step size to satisfy the KL divergence threshold:
                # if the update moved the policy too far, undo it and retry
                # with a halved learning rate until KL <= 0.02.
                with torch.no_grad():
                    _, pdf = policy.evaluate(obs_batch)

                curr_lr = self.lr
                while kl_divergence(pdf, old_pdf).mean() > 0.02:
                    curr_lr /= 2
                    self.update_lr(optimizer, curr_lr)
                    policy.load_state_dict(old_policy.state_dict())
                    optimizer.step()
                    with torch.no_grad():
                        _, pdf = policy.evaluate(obs_batch)

                if curr_lr != self.lr:
                    print("KL div threshold violated, changed step size to ", curr_lr)

                losses.append([actor_loss.item(),
                               pdf.entropy().mean().item(),
                               critic_loss.item(),
                               ratio.mean().item()])

            # TODO: add verbosity arguments to suppress this
            print(' '.join(["%g" % x for x in np.mean(losses, axis=0)]))

        if logger is not None:
            test = self.sample(env, policy, 800 // self.n_proc, 400, deterministic=True)
            _, pdf = policy.evaluate(observations)
            _, old_pdf = old_policy.evaluate(observations)

            entropy = pdf.entropy().mean().item()
            kl = kl_divergence(pdf, old_pdf).mean().item()

            logger.record("Return (test)", np.mean(test.ep_returns))
            logger.record("Return (batch)", np.mean(batch.ep_returns))
            logger.record("Mean Eplen", np.mean(batch.ep_lens))
            logger.record("Mean KL Div", kl)
            logger.record("Mean Entropy", entropy)
            logger.dump()

            # TODO: add option for how often to save model
            # if itr % 10 == 0:
            if np.mean(test.ep_returns) > self.max_return:
                self.max_return = np.mean(test.ep_returns)
                self.save(policy, env)
                self.save_optim(optimizer)

    print("Total time: {:.2f} s".format(time.time() - start_time))
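# update_lr, save, and save_optim are called above but not shown. update_lr is
# a minimal sketch using the standard PyTorch param_groups API; the save
# methods are assumptions (a torch.save of the whole policy and the optimizer
# state, with a hypothetical self.save_path attribute and `import os` at the
# top of the file):
def update_lr(self, optimizer, new_lr):
    # Overwrite the learning rate of every parameter group in place.
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr

def save(self, policy, env):
    torch.save(policy, os.path.join(self.save_path, "actor.pt"))

def save_optim(self, optimizer):
    torch.save(optimizer.state_dict(), os.path.join(self.save_path, "optim.pt"))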
def train(self, env_fn, policy, n_itr, normalize=None, logger=None):
    if normalize is not None:
        policy.train()
    else:
        policy.train(0)

    env = Vectorize([env_fn])  # this will be useful for parallelism later

    if normalize is not None:
        env = normalize(env)
        mean, std = env.ob_rms.mean, np.sqrt(env.ob_rms.var + 1E-8)
        policy.obs_mean = torch.Tensor(mean)
        policy.obs_std = torch.Tensor(std)
        policy.train(0)

    old_policy = deepcopy(policy)

    optimizer = optim.Adam(policy.parameters(), lr=self.lr, eps=self.eps)

    start_time = time.time()

    for itr in range(n_itr):
        print("********** Iteration {} ************".format(itr))

        sample_start = time.time()
        batch = self.sample_parallel(env_fn, policy, self.num_steps, self.max_traj_len)

        print("time elapsed: {:.2f} s".format(time.time() - start_time))
        print("sample time elapsed: {:.2f} s".format(time.time() - sample_start))

        observations, actions, returns, values = map(torch.Tensor, batch.get())

        advantages = returns - values
        advantages = (advantages - advantages.mean()) / (advantages.std() + self.eps)

        minibatch_size = self.minibatch_size or advantages.numel()

        print("timesteps in batch: %i" % advantages.numel())

        old_policy.load_state_dict(policy.state_dict())  # WAY faster than deepcopy

        optimizer_start = time.time()
        self.update(policy, old_policy, optimizer, observations, actions, returns, advantages, env_fn)
        print("optimizer time elapsed: {:.2f} s".format(time.time() - optimizer_start))

        if logger is not None:
            evaluate_start = time.time()
            test = self.sample_parallel(env_fn, policy, 800 // self.n_proc,
                                        self.max_traj_len, deterministic=True)
            print("evaluate time elapsed: {:.2f} s".format(time.time() - evaluate_start))

            _, pdf = policy.evaluate(observations)
            _, old_pdf = old_policy.evaluate(observations)

            entropy = pdf.entropy().mean().item()
            kl = kl_divergence(pdf, old_pdf).mean().item()

            logger.record("Return (test)", np.mean(test.ep_returns))
            logger.record("Return (batch)", np.mean(batch.ep_returns))
            logger.record("Mean Eplen", np.mean(batch.ep_lens))
            logger.record("Mean KL Div", kl)
            logger.record("Mean Entropy", entropy)
            logger.dump()

            # TODO: add option for how often to save model
            if np.mean(test.ep_returns) > self.max_return:
                self.max_return = np.mean(test.ep_returns)
                self.save(policy, env)

    print("Total time: {:.2f} s".format(time.time() - start_time))
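# self.update is called above but not defined in this file. A sketch that
# mirrors the inline epoch loop from the SGD variant of train() earlier
# (clipped PPO surrogate + value loss + entropy bonus, minus the adaptive-KL
# backtracking); treat it as a reconstruction, not the original method:
def update(self, policy, old_policy, optimizer, observations, actions,
           returns, advantages, env_fn):
    minibatch_size = self.minibatch_size or advantages.numel()

    for _ in range(self.epochs):
        sampler = BatchSampler(SubsetRandomSampler(range(advantages.numel())),
                               minibatch_size, drop_last=True)
        for indices in sampler:
            indices = torch.LongTensor(indices)
            obs_batch = observations[indices]
            action_batch = actions[indices]
            return_batch = returns[indices]
            advantage_batch = advantages[indices]

            values, pdf = policy.evaluate(obs_batch)
            with torch.no_grad():
                _, old_pdf = old_policy.evaluate(obs_batch)
                old_log_probs = old_pdf.log_prob(action_batch).sum(-1, keepdim=True)
            log_probs = pdf.log_prob(action_batch).sum(-1, keepdim=True)

            # Clipped PPO objective.
            ratio = (log_probs - old_log_probs).exp()
            cpi_loss = ratio * advantage_batch
            clip_loss = ratio.clamp(1.0 - self.clip, 1.0 + self.clip) * advantage_batch
            actor_loss = -torch.min(cpi_loss, clip_loss).mean()

            critic_loss = 0.5 * (return_batch - values).pow(2).mean()
            entropy_penalty = -self.entropy_coeff * pdf.entropy().mean()

            optimizer.zero_grad()
            (actor_loss + critic_loss + entropy_penalty).backward()
            optimizer.step()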