def test_learning(self):
    """Smoke-test a single GAIL training epoch using the 'trpo' update.

    Builds a Gaussian policy, a state-value function and a discriminator
    for ``self.env``, loads the pickled expert episodes, samples a short
    agent rollout, post-processes it and calls ``gail.train`` once.
    Nothing is asserted on the training result: the test passes when the
    whole pipeline runs without raising.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space
    gamma = 0.99  # discount passed to compute_rets / compute_advs
    lam = 0.95    # third argument of compute_advs

    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, pol_net)

    vf_net = VNet(ob_space)
    vf = DeterministicSVfunc(ob_space, vf_net)

    discrim_net = DiscrimNet(ob_space, ac_space, h1=32, h2=32)
    discrim = DeterministicSAVfunc(ob_space, ac_space, discrim_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)
        optim_discrim = torch.optim.Adam(discrim_net.parameters(), 3e-4)

        # NOTE(review): the path is relative, so the test depends on the
        # working directory it is launched from.
        # NOTE(review): pickle.load can execute arbitrary code; this is
        # presumably a trusted fixture shipped with the repo -- confirm.
        expert_path = os.path.join('data/expert_epis',
                                   'Pendulum-v0_2epis.pkl')
        with open(expert_path, 'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj.register_epis()

        epis = sampler.sample(pol, max_steps=32)

        # Agent trajectory: the discriminator supplies the pseudo rewards
        # that the value / return / advantage steps are computed after.
        agent_traj = Traj()
        agent_traj.add_epis(epis)
        agent_traj = ef.compute_pseudo_rews(agent_traj, discrim)
        agent_traj = ef.compute_vs(agent_traj, vf)
        agent_traj = ef.compute_rets(agent_traj, gamma)
        agent_traj = ef.compute_advs(agent_traj, gamma, lam)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        # Return value intentionally ignored: this is a runs-without-error
        # check only.
        gail.train(agent_traj, expert_traj, pol, vf, discrim,
                   optim_vf, optim_discrim,
                   rl_type='trpo', epoch=1, batch_size=32,
                   discrim_batch_size=32, discrim_step=1,
                   pol_ent_beta=1e-3, discrim_ent_beta=1e-5)
    finally:
        # Drop the sampler reference even when a step above raises, so it
        # is not kept alive by the failing frame. Presumably this is what
        # releases the sampler's worker resources -- confirm in EpiSampler.
        del sampler
def test_learning(self):
    """Smoke-test a single AIRL training epoch using the 'trpo' update.

    Builds a Gaussian policy, a state-value function, a state-only reward
    function and a shaping value function for ``self.env``, loads the
    pickled expert episodes, samples a short agent rollout, post-processes
    it and calls ``airl.train`` once. Nothing is asserted on the training
    result: the test passes when the whole pipeline runs without raising.
    """
    ob_space = self.env.ob_space
    ac_space = self.env.ac_space
    gamma = 0.99  # discount shared by compute_rets / compute_advs / train
    lam = 0.95    # third argument of compute_advs

    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, pol_net)

    vf_net = VNet(ob_space)
    vf = DeterministicSVfunc(ob_space, vf_net)

    rewf_net = VNet(ob_space, h1=32, h2=32)
    rewf = DeterministicSVfunc(ob_space, rewf_net)

    shaping_vf_net = VNet(ob_space, h1=32, h2=32)
    shaping_vf = DeterministicSVfunc(ob_space, shaping_vf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    try:
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)
        # One optimizer updates both the reward net and the shaping net.
        discrim_params = [*rewf_net.parameters(),
                          *shaping_vf_net.parameters()]
        optim_discrim = torch.optim.Adam(discrim_params, 3e-4)

        # NOTE(review): the path is relative, so the test depends on the
        # working directory it is launched from.
        # NOTE(review): pickle.load can execute arbitrary code; this is
        # presumably a trusted fixture shipped with the repo -- confirm.
        expert_path = os.path.join('data/expert_epis',
                                   'Pendulum-v0_2epis.pkl')
        with open(expert_path, 'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj = ef.add_next_obs(expert_traj)
        expert_traj.register_epis()

        epis = sampler.sample(pol, max_steps=32)

        # Agent trajectory: next observations are added first, then the
        # reward function supplies state-only pseudo rewards.
        agent_traj = Traj()
        agent_traj.add_epis(epis)
        agent_traj = ef.add_next_obs(agent_traj)
        agent_traj = ef.compute_pseudo_rews(
            agent_traj, rew_giver=rewf, state_only=True)
        agent_traj = ef.compute_vs(agent_traj, vf)
        agent_traj = ef.compute_rets(agent_traj, gamma)
        agent_traj = ef.compute_advs(agent_traj, gamma, lam)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        # Return value intentionally ignored: this is a runs-without-error
        # check only.
        airl.train(agent_traj, expert_traj, pol, vf,
                   optim_vf, optim_discrim,
                   rewf=rewf, shaping_vf=shaping_vf,
                   rl_type='trpo', epoch=1, batch_size=32,
                   discrim_batch_size=32, discrim_step=1,
                   pol_ent_beta=1e-3, gamma=gamma)
    finally:
        # Drop the sampler reference even when a step above raises, so it
        # is not kept alive by the failing frame. Presumably this is what
        # releases the sampler's worker resources -- confirm in EpiSampler.
        del sampler
if args.rl_type == 'ppo_kl': kl_beta = args.init_kl_beta if args.pretrain: with measure('bc pretrain'): _ = behavior_clone.train(expert_traj, pol, optim_pol, args.bc_batch_size, args.bc_epoch) while args.max_epis > total_epi: with measure('sample'): epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): agent_traj = Traj() agent_traj.add_epis(epis) agent_traj = ef.compute_pseudo_rews(agent_traj, discrim) agent_traj = ef.compute_vs(agent_traj, vf) agent_traj = ef.compute_rets(agent_traj, args.gamma) agent_traj = ef.compute_advs(agent_traj, args.gamma, args.lam) agent_traj = ef.centerize_advs(agent_traj) agent_traj = ef.compute_h_masks(agent_traj) agent_traj.register_epis() if args.data_parallel: pol.dp_run = True vf.dp_run = True discrim.dp_run = True if args.rl_type == 'trpo': result_dict = gail.train( agent_traj,
if args.pretrain: with measure('bc pretrain'): for _ in range(args.bc_epoch): _ = behavior_clone.train(expert_traj, pol, optim_pol, args.bc_batch_size) while args.max_epis > total_epi: with measure('sample'): epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): agent_traj = Traj() agent_traj.add_epis(epis) agent_traj = ef.add_next_obs(agent_traj) agent_traj = ef.compute_pseudo_rews( agent_traj, rew_giver=rewf if args.rew_type == 'rew' else advf, state_only=True if args.rew_type == 'rew' else False) agent_traj = ef.compute_vs(agent_traj, vf) agent_traj = ef.compute_rets(agent_traj, args.gamma) agent_traj = ef.compute_advs(agent_traj, args.gamma, args.lam) agent_traj = ef.centerize_advs(agent_traj) agent_traj = ef.compute_h_masks(agent_traj) agent_traj.register_epis() if args.rl_type == 'trpo': result_dict = airl.train( agent_traj, expert_traj, pol, vf, optim_vf,
kl_beta = args.init_kl_beta if args.pretrain: with measure('bc pretrain'): for _ in range(args.bc_epoch): _ = behavior_clone.train(expert_traj, pol, optim_pol, args.bc_batch_size) while args.max_epis > total_epi: with measure('sample'): epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): agent_traj = Traj() agent_traj.add_epis(epis) agent_traj = task_oriented_reward( agent_traj, discrim) if args.tr else ef.compute_pseudo_rews( agent_traj, discrim) # TRGAILorGAIL agent_traj = ef.compute_vs(agent_traj, vf) agent_traj = ef.compute_rets(agent_traj, args.gamma) agent_traj = ef.compute_advs(agent_traj, args.gamma, args.lam) agent_traj = ef.centerize_advs(agent_traj) agent_traj = ef.compute_h_masks(agent_traj) agent_traj.register_epis() if args.data_parallel: pol.dp_run = True vf.dp_run = True discrim.dp_run = True if args.rl_type == 'trpo': result_dict = gail.train( agent_traj,