def test_learning(self):
    """Smoke-test a single TRPO update with feed-forward networks.

    Builds a ``GaussianPol`` on top of ``PolNet`` and a
    ``DeterministicSVfunc`` on top of ``VNet``, samples a short rollout from
    ``self.env``, runs the ``ef`` preprocessing calls on it and invokes
    ``trpo.train`` once.

    The test contains no assertions: it passes as long as the whole
    pipeline runs without raising.
    """
    # Hyperparameters of this smoke test, named instead of inlined.
    gamma, lam = 0.99, 0.95
    epoch, batch_size = 1, 24

    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, pol_net)

    vf_net = VNet(ob_space, h1=32, h2=32)
    vf = DeterministicSVfunc(ob_space, vf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, gamma)
        traj = ef.compute_advs(traj, gamma, lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        # The returned result dict is not inspected by this test.
        trpo.train(traj, pol, vf, optim_vf, epoch, batch_size)
    finally:
        # Drop the sampler reference even when a step above raises, so a
        # failing test does not keep it alive through the traceback frame.
        # NOTE(review): presumably EpiSampler stops its workers when it is
        # finalized -- confirm against the sampler implementation.
        del sampler
def test_learning_rnn(self):
    """Smoke-test a single TRPO update with LSTM networks.

    Builds a ``CategoricalPol`` on top of ``PolNetLSTM`` and a
    ``DeterministicSVfunc`` on top of ``VNetLSTM`` (both constructed with
    ``rnn=True``), samples a rollout from ``self.env``, runs the ``ef``
    preprocessing calls on it and invokes ``trpo.train`` once.

    The test contains no assertions: it passes as long as the whole
    pipeline runs without raising.
    """
    # Hyperparameters of this smoke test, named instead of inlined.
    gamma, lam = 0.99, 0.95
    epoch, batch_size = 1, 2

    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
    pol = CategoricalPol(ob_space, ac_space, pol_net, rnn=True)

    vf_net = VNetLSTM(ob_space, h_size=32, cell_size=32)
    vf = DeterministicSVfunc(ob_space, vf_net, rnn=True)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, gamma)
        traj = ef.compute_advs(traj, gamma, lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        # The returned result dict is not inspected by this test.
        trpo.train(traj, pol, vf, optim_vf, epoch, batch_size)
    finally:
        # Drop the sampler reference even when a step above raises, so a
        # failing test does not keep it alive through the traceback frame.
        # NOTE(review): presumably EpiSampler stops its workers when it is
        # finalized -- confirm against the sampler implementation.
        del sampler
# Excerpt of one training iteration: sample episodes with the current policy,
# load them into a fresh Traj, run the ef.* preprocessing calls, train once
# with trpo.train, then update the episode/step counters and per-episode
# reward sums used for logging.
# The batch size handed to trpo.train is args.rnn_batch_size when args.rnn is
# truthy and args.batch_size otherwise.
# NOTE(review): this excerpt starts mid-loop and is cut off inside the
# logger.record_results(...) call; its remaining arguments are not visible.
epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): traj = Traj() traj.add_epis(epis) traj = ef.compute_vs(traj, vf) traj = ef.compute_rets(traj, args.gamma) traj = ef.compute_advs(traj, args.gamma, args.lam) traj = ef.centerize_advs(traj) traj = ef.compute_h_masks(traj) traj.register_epis() result_dict = trpo.train(traj, pol, vf, optim_vf, args.epoch_per_iter, batch_size=args.batch_size if not args.rnn else args.rnn_batch_size) total_epi += traj.num_epi step = traj.num_step total_step += step rewards = [np.sum(epi['rews']) for epi in epis] mean_rew = np.mean(rewards) logger.record_results(args.log, result_dict, score_file, total_epi, step, total_step,
# Best mean reward seen so far; initialised to a very low value.
max_rew = -1e6

# Keep sampling and training until args.max_epis episodes have been consumed.
# NOTE(review): the original nesting was lost in this excerpt; the counter
# updates and logging are placed after the 'train' timing block -- confirm.
while total_epi < args.max_epis:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)

    with measure('train'):
        # Load the sampled episodes and run the preprocessing calls that
        # precede trpo.train.
        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, args.gamma)
        traj = ef.compute_advs(traj, args.gamma, args.lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        result_dict = trpo.train(
            traj,
            pol,
            vf,
            optim_vf,
            args.epoch_per_iter,
            args.batch_size,
        )

    # Progress counters for this iteration.
    total_epi += traj.num_epi
    step = traj.num_step
    total_step += step

    # Per-episode reward sums and their mean.
    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)

    logger.record_results(
        args.log,
        result_dict,
        score_file,
        total_epi,
        step,
        total_step,
        rewards,
        plot_title=args.env_name,
    )