示例#1
0
    def test_learning(self):
        """Run a single behavior-cloning training call on stored expert data.

        Builds a ``GaussianPol`` over the spaces of ``self.env``, loads the
        pickled expert episodes from ``data/expert_epis``, splits them with
        ``train_size=0.7`` into train and test trajectories, and calls
        ``behavior_clone.train`` on the training trajectory. The method makes
        no assertions: it succeeds as long as nothing raises.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space

        net = PolNet(ob_space, ac_space, h1=32, h2=32)
        policy = GaussianPol(ob_space, ac_space, net)

        # The sampler is constructed but never asked for episodes in this
        # method; it is only released again at the end.
        sampler = EpiSampler(self.env, policy, num_parallel=1)

        optimizer = torch.optim.Adam(net.parameters(), 3e-4)

        expert_path = os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl')
        with open(expert_path, 'rb') as f:
            expert_epis = pickle.load(f)

        train_epis, test_epis = ef.train_test_split(
            expert_epis, train_size=0.7)

        # Build the train trajectory first, then the test one; each gets its
        # episodes added and registered before the next is created.
        trajs = []
        for epis in (train_epis, test_epis):
            traj = Traj()
            traj.add_epis(epis)
            traj.register_epis()
            trajs.append(traj)
        train_traj, test_traj = trajs

        # 256 is passed positionally -- presumably the batch size; confirm
        # against the signature of behavior_clone.train.
        result_dict = behavior_clone.train(train_traj, policy, optimizer, 256)

        # Drop the local reference to the sampler explicitly.
        # NOTE(review): presumably this triggers EpiSampler cleanup -- confirm.
        del sampler
                         pol_net,
                         data_parallel=args.data_parallel)
elif isinstance(action_space, gym.spaces.MultiDiscrete):
    pol = MultiCategoricalPol(observation_space,
                              action_space,
                              pol_net,
                              data_parallel=args.data_parallel)
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')

def _registered_traj(epis):
    """Return a new ``Traj`` with ``epis`` added and registered."""
    traj = Traj()
    traj.add_epis(epis)
    traj.register_epis()
    return traj


# Episode sampler and policy optimizer, both configured from the CLI args.
sampler = EpiSampler(
    env, pol, num_parallel=args.num_parallel, seed=args.seed)
optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)

# Load the pickled expert episodes.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# point expert_dir/expert_fname at trusted data.
expert_path = os.path.join(args.expert_dir, args.expert_fname)
with open(expert_path, 'rb') as f:
    expert_epis = pickle.load(f)

# Split the episodes and wrap each part in its own registered trajectory.
train_epis, test_epis = ef.train_test_split(
    expert_epis, train_size=args.train_size)
train_traj = _registered_traj(train_epis)
test_traj = _registered_traj(test_epis)

# Mean of the per-episode sums of 'rews' over all expert episodes
# (train and test together).
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('num_train_epi={}'.format(train_traj.num_epi))

# Very low starting value; presumably tracks the best reward seen in the
# epoch loop below -- confirm where it is updated.
max_rew = -1e6

for curr_epoch in range(args.epoch):
    if args.data_parallel: