Example #1
def test_trajectory_data(self):
    # Build an EnvSpec from a gym environment, collect 100 random
    # transitions, and cut a trajectory every 10 steps.
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    a = TrajectoryData(env_spec)
    tmp_traj = TransitionData(env_spec)
    st = env.reset()
    re_list = []
    st_list = []
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        st_list.append(st_new)
        re_list.append(re)
        # Force a trajectory boundary every 10 steps, ignoring the
        # environment's own done flag.
        done = (i + 1) % 10 == 0
        tmp_traj.append(state=st,
                        new_state=st_new,
                        action=ac,
                        done=done,
                        reward=re)
        st = st_new  # advance the current state before the next step
        if done:
            a.append(tmp_traj)
            tmp_traj.reset()
    # 100 steps with a cut every 10 steps -> 10 trajectories of 10 transitions.
    self.assertEqual(len(a.trajectories), 10)
    for traj in a.trajectories:
        self.assertEqual(len(traj), 10)
    # Flatten back into transition data and check that rewards and new
    # states are preserved in order.
    data = a.return_as_transition_data()
    data_gen = data.return_generator()
    for d, rew, s_new in zip(data_gen, re_list, st_list):
        self.assertEqual(d[3], rew)
        self.assertTrue(np.equal(s_new, d[1]).all())
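
The final loop of the test relies on the flattened generator yielding indexable records, but the assertions pin down only two positions: index 1 holds the stored new_state and index 3 the reward. A small helper built on that observation alone might look like the sketch below; the two index positions are taken from the test, nothing else about the record layout is assumed, and the helper name is illustrative:

def rewards_and_new_states(trajectory_data):
    """Yield (reward, new_state) pairs from a TrajectoryData container.

    Only record[1] (new_state) and record[3] (reward) are used, because
    those are the positions the test above asserts on; the rest of the
    record layout is left untouched.
    """
    flat = trajectory_data.return_as_transition_data()
    for record in flat.return_generator():
        yield record[3], record[1]
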
Example #2
    def train(self,
              trajectory_data: TrajectoryData = None,
              train_iter=None,
              sess=None) -> dict:
        super(PPO, self).train()
        # Fall back to the internally collected trajectory memory when no
        # batch is passed in explicitly.
        if trajectory_data is None:
            trajectory_data = self.trajectory_memory
        if len(trajectory_data) == 0:
            raise MemoryBufferLessThanBatchSizeError(
                'not enough trajectory data')
        tf_sess = sess if sess else tf.get_default_session()
        # Annotate the trajectories with value estimates, discounted
        # returns, and GAE advantages before flattening them.
        SampleProcessor.add_estimated_v_value(trajectory_data,
                                              value_func=self.value_func)
        SampleProcessor.add_discount_sum_reward(trajectory_data,
                                                gamma=self.parameters('gamma'))
        SampleProcessor.add_gae(trajectory_data,
                                gamma=self.parameters('gamma'),
                                name='advantage_set',
                                lam=self.parameters('lam'),
                                value_func=self.value_func)

        # Flatten into per-step transitions (order preserved) and
        # normalize the advantages.
        train_data = trajectory_data.return_as_transition_data(
            shuffle_flag=False)
        SampleProcessor.normalization(train_data, key='advantage_set')
        # Update the policy and the value function, using the configured
        # iteration counts unless train_iter overrides them.
        policy_res_dict = self._update_policy(
            train_data=train_data,
            train_iter=train_iter
            if train_iter else self.parameters('policy_train_iter'),
            sess=tf_sess)
        value_func_res_dict = self._update_value_func(
            train_data=train_data,
            train_iter=train_iter
            if train_iter else self.parameters('value_func_train_iter'),
            sess=tf_sess)
        # Clear the memory and merge both sets of training statistics.
        self.trajectory_memory.reset()
        return {**policy_res_dict, **value_func_res_dict}
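
Driving this train() method from outside comes down to handing it a TrajectoryData batch (or letting it fall back to the algorithm's own trajectory_memory) inside an active TensorFlow 1.x session, since the method reaches for tf.get_default_session() when no session is passed. A minimal calling sketch under that assumption follows; the ppo instance and the collect_trajectories() helper are placeholders for library-specific setup that the example does not show:

import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(50):
        # collect_trajectories() stands in for whatever rollout code
        # fills a TrajectoryData batch; it is not part of the example.
        trajectories = collect_trajectories()
        res = ppo.train(trajectory_data=trajectories,
                        train_iter=10,
                        sess=sess)
        # The returned dict merges the policy and value-function
        # update statistics.
        print(epoch, res)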