def test_learning(self):
    """Smoke-test behavior cloning end-to-end on recorded Pendulum expert data.

    Builds a Gaussian policy, loads pickled expert episodes, splits them
    into train/test trajectories, and runs one round of behavior-clone
    training. Passing means the pipeline executes without error.
    """
    obs_space = self.env.observation_space
    act_space = self.env.action_space

    # Policy network and the Gaussian policy wrapped around it.
    net = PolNet(obs_space, act_space, h1=32, h2=32)
    policy = GaussianPol(obs_space, act_space, net)

    # Sampler is constructed (exercising its setup path) but not sampled from.
    episode_sampler = EpiSampler(self.env, policy, num_parallel=1)

    optimizer = torch.optim.Adam(net.parameters(), 3e-4)

    # Load the pre-recorded expert episodes shipped with the test data.
    with open(os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl'), 'rb') as f:
        demos = pickle.load(f)

    train_epis, test_epis = ef.train_test_split(demos, train_size=0.7)

    def as_registered_traj(episodes):
        # Wrap a list of episodes into a Traj frozen for indexed access.
        traj = Traj()
        traj.add_epis(episodes)
        traj.register_epis()
        return traj

    train_traj = as_registered_traj(train_epis)
    test_traj = as_registered_traj(test_epis)

    # One behavior-cloning pass with batch size 256.
    result_dict = behavior_clone.train(train_traj, policy, optimizer, 256)

    # Explicitly drop the sampler so its parallel workers are torn down.
    del episode_sampler
# NOTE(review): fragment starts mid-script — `expert_traj`, `expert_epis`,
# `pol`, `vf`, `optim_pol`, `sampler`, and `args` are defined earlier,
# outside this view. It also ends inside the `with measure('train'):` suite.

# Augment expert trajectories with next-observation fields, then freeze
# them for indexed sampling during training.
expert_traj = ef.add_next_obs(expert_traj)
expert_traj.register_epis()

# Log the expert's average undiscounted return as a reference score.
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('expert_num_epi={}'.format(expert_traj.num_epi))

total_epi = 0
total_step = 0
max_rew = -1e6  # best mean reward so far; sentinel below any achievable score
kl_beta = args.init_kl_beta

# Optional warm start: behavior-clone the policy on the expert data
# before the main loop, and checkpoint the pretrained weights.
if args.pretrain:
    with measure('bc pretrain'):
        for _ in range(args.bc_epoch):
            _ = behavior_clone.train(expert_traj, pol, optim_pol, args.bc_batch_size)
    # NOTE(review): the collapsed source does not show this statement's
    # indentation — placed after the pretrain loop (save once) on the
    # assumption that is the intent; confirm against the original file.
    torch.save(pol.state_dict(), os.path.join(args.log, 'models', 'pol_bc.pkl'))

# Main loop: sample on-policy episodes, then annotate the trajectory
# with values, returns, advantages, and masks for the update step.
while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, args.gamma)
        traj = ef.compute_advs(traj, args.gamma, args.lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
# NOTE(review): fragment starts mid-script — `train_traj`, `train_epis`,
# `test_epis`, `expert_epis`, `pol`, `optim_pol`, `sampler`, and `args`
# are defined earlier, outside this view; it ends mid-call to
# `logger.record_results_bc(...)`.

train_traj.register_epis()
test_traj = Traj()
test_traj.add_epis(test_epis)
test_traj.register_epis()

# Log the expert's average undiscounted return as a reference score.
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('num_train_epi={}'.format(train_traj.num_epi))

max_rew = -1e6  # best mean reward so far; sentinel below any achievable score

# Behavior-cloning epochs: train on the expert split, evaluate on the
# held-out split, and periodically roll out the policy to score it.
for curr_epoch in range(args.epoch):
    if args.data_parallel:
        # Enable the policy's data-parallel execution path for training.
        pol.dp_run = True

    result_dict = behavior_clone.train(train_traj, pol, optim_pol, args.batch_size)
    test_result_dict = behavior_clone.test(test_traj, pol)

    if args.data_parallel:
        pol.dp_run = False

    # Merge held-out metrics into the training metrics for logging.
    for key in test_result_dict.keys():
        result_dict[key] = test_result_dict[key]

    # Evaluate by environment rollout at epoch 0 and every
    # check_rate-fraction of the total epochs.
    if curr_epoch % int(args.check_rate * args.epoch) == 0 or curr_epoch == 0:
        with measure('sample'):
            paths = sampler.sample(pol, max_epis=args.max_epis_per_iter)
        rewards = [np.sum(path['rews']) for path in paths]
        # NOTE(review): recomputes the per-path sums instead of reusing
        # `rewards` — np.mean(rewards) would be equivalent and cheaper.
        mean_rew = np.mean([np.sum(path['rews']) for path in paths])
        logger.record_results_bc(args.log,