def _train_ppo_epoch(self, full_input):
    total_obs = self.n_steps * self.envs.num_envs
    shuffle_idx = np.random.permutation(total_obs)
    # Shuffle every rollout tensor with the same permutation, then split the
    # flat batch into minibatches of size ppo_par.batch_size.
    batches = dict_of_lists_to_list_of_dicst({
        k: np.split(v[shuffle_idx], total_obs // self.ppo_par.batch_size)
        for k, v in full_input.items()
    })
    for b in batches:
        self.agent.train(b)
def _train_ppo_recurrent_epoch(self, full_input, rnn_state):
    # NOTE: the reference implementation shuffles here as well -- be careful
    # and recheck this: rnn_state might need to be carried inside full_input.
    total_obs = self.n_steps * self.envs.num_envs
    shuffle_idx = np.random.permutation(total_obs)
    batches = dict_of_lists_to_list_of_dicst({
        k: np.split(v[shuffle_idx], total_obs // self.ppo_par.batch_size)
        for k, v in full_input.items()
    })
    for b in batches:
        # IMPORTANT: during training you don't need the collected rnn_state;
        # you start from zero and it takes the shape it needs to have.
        self.agent.train_recurrent(b, rnn_state)
def _train_ppo_epoch(self, full_input):
    total_obs = self.n_steps * self.envs.num_envs
    shuffle_idx = np.random.permutation(total_obs)
    batches = dict_of_lists_to_list_of_dicst({
        k: np.split(v[shuffle_idx], total_obs // self.ppo_par.batch_size)
        for k, v in full_input.items()
    })
    # The policy-type check is hoisted out of the loop so the trainer is
    # chosen once, before iterating over the minibatches.
    if self.policy_type == MetaPolicy:
        for b in batches:
            self.agent.train_recurrent(b)
    else:
        for b in batches:
            self.agent.train(b)
def test_dict_list_transpose(self):
    x = {
        "a": [1, 2, 3],
        "b": [np.array([5, 6]), np.array([7, 8]), np.array([90, 100])],
    }
    result = dict_of_lists_to_list_of_dicst(x)
    expected = [
        {"a": 1, "b": np.array([5, 6])},
        {"a": 2, "b": np.array([7, 8])},
        {"a": 3, "b": np.array([90, 100])},
    ]
    assert len(result) == len(expected)
    for e, r in zip(expected, result):
        assert e.keys() == r.keys()
        assert e["a"] == r["a"]
        self.assertAllEqual(e["b"], r["b"])
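# A minimal sketch of the transpose helper exercised by the test above,
# assuming its definition is not shown in this section. The misspelled name
# dict_of_lists_to_list_of_dicst is kept as-is, since that is the identifier
# all callers use. Per the test's contract, it turns a dict of equal-length
# sequences into a list of per-index dicts (one dict per minibatch in the
# PPO code above).
def dict_of_lists_to_list_of_dicst(d):
    lengths = {len(v) for v in d.values()}
    assert len(lengths) == 1, "all values must have the same length"
    n = lengths.pop()
    return [{k: v[i] for k, v in d.items()} for i in range(n)]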