def _retrieve_batch(self, batch_size, chunk_size):
    sampled_game_idx = self._sample_game_idx(batch_size)
    observations, actions, rewards, nonterminals = [], [], [], []
    for idx in sampled_game_idx:
        _observations, _actions, _rewards, _nonterminals = self._retrieve_game(idx, chunk_size)
        observations.append(_observations)
        actions.append(_actions)
        rewards.append(_rewards)
        nonterminals.append(_nonterminals)
    observations = torch.as_tensor(np.array(observations, dtype=np.float32))
    if not self.symbolic_env:
        preprocess_observation_(observations, self.bit_depth)  # Undo discretisation for visual observations
    # The stacked data is game-major (batch_size, chunk_size, ...); swap the first two
    # axes to obtain the time-major (chunk_size, batch_size, ...) layout. A plain
    # reshape here would interleave chunks from different games.
    observations = observations.transpose(0, 1)
    actions = np.array(actions).swapaxes(0, 1).reshape(chunk_size, batch_size, -1)
    rewards = np.array(rewards).swapaxes(0, 1).reshape(chunk_size, batch_size)
    nonterminals = np.array(nonterminals).swapaxes(0, 1).reshape(chunk_size, batch_size)
    return observations, actions, rewards, nonterminals
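# The two helpers used above are not shown in this snippet. A minimal sketch of what
# they could look like, under the assumption that episodes are kept in a list of
# per-episode arrays (self.games and its keys are hypothetical, not from the original
# code), and that every stored episode has at least chunk_size steps:
def _sample_game_idx(self, batch_size):
    # Uniformly sample episode indices with replacement
    return np.random.randint(0, len(self.games), size=batch_size)

def _retrieve_game(self, idx, chunk_size):
    # Slice a random fixed-length chunk out of one episode
    game = self.games[idx]
    start = np.random.randint(0, len(game['rewards']) - chunk_size + 1)
    sl = slice(start, start + chunk_size)
    return (game['observations'][sl], game['actions'][sl],
            game['rewards'][sl], game['nonterminals'][sl])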
def _retrieve_batch(self, idxs, n, L):
    vec_idxs = idxs.transpose().reshape(-1)  # Unroll indices
    observations = torch.as_tensor(self.observations[vec_idxs].astype(np.float32))
    if not self.symbolic_env:
        preprocess_observation_(observations, self.bit_depth)  # Undo discretisation for visual observations
    return (observations.reshape(L, n, *observations.shape[1:]),
            self.actions[vec_idxs].reshape(L, n, -1),
            self.rewards[vec_idxs].reshape(L, n),
            self.nonterminals[vec_idxs].reshape(L, n, 1))
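# In a PlaNet-style replay memory, _retrieve_batch of this form is typically driven by
# a sample() wrapper that builds an (n, L) index matrix. A sketch under that assumption;
# _sample_idx (returning L consecutive valid buffer indices) and self.device are assumed,
# not taken from the snippet above.
def sample(self, n, L):
    idxs = np.asarray([self._sample_idx(L) for _ in range(n)])  # shape (n, L)
    batch = self._retrieve_batch(idxs, n, L)
    # Convert the numpy parts to tensors and move everything onto the buffer's device
    return [torch.as_tensor(item).to(device=self.device) for item in batch]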
def _retrieve_batch(self, idxs, n, L):
    vec_idxs = idxs.transpose().reshape(-1)  # Unroll indices
    obs_ = self.observations[vec_idxs].astype(np.float32)
    # next_obs_ = self.next_observations[vec_idxs].astype(np.float32)
    # obs_aug = obs_.copy()
    # next_obs_aug = next_obs_.copy()
    # Undo discretisation for visual observations
    # NOTE: unlike the variants above, this relies on preprocess_observation_ returning
    # the preprocessed tensor rather than only modifying it in place.
    observations = torch.as_tensor(obs_)
    observations = preprocess_observation_(observations, self.bit_depth).to(self.device)
    # next_observations = torch.as_tensor(next_obs_)
    # next_observations = preprocess_observation_(
    #     next_observations, self.bit_depth).to(self.device)
    # I think we need to preserve the original observations for reconstructions
    # observations0 = self.aug_trans(observations)
    # next_observations0 = self.aug_trans(next_observations)
    # observations_aug = observations.clone()
    observations_aug0 = self.aug_trans(observations)
    observations_aug1 = self.aug_trans(observations)
    # next_observations_aug = next_observations.clone()
    # next_observations_aug = self.aug_trans(next_observations_aug)
    return (observations.reshape(L, n, *observations.shape[1:]),
            self.actions[vec_idxs].reshape(L, n, -1),
            self.rewards[vec_idxs].reshape(L, n),
            self.nonterminals[vec_idxs].reshape(L, n, 1),
            observations_aug0.reshape(L, n, *observations_aug0.shape[1:]),
            observations_aug1.reshape(L, n, *observations_aug1.shape[1:]))
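# self.aug_trans is not defined in the snippet above. A common choice for producing the
# two extra views is a random-shift augmentation (pad + random crop), as used in RAD/DrQ.
# A minimal pure-PyTorch sketch of such a transform (an assumption, not the original
# implementation):
import torch
import torch.nn.functional as F

class RandomShiftAug:
    def __init__(self, pad=4):
        self.pad = pad

    def __call__(self, x):
        # x: (B, C, H, W) float image batch
        b, c, h, w = x.shape
        padded = F.pad(x, [self.pad] * 4, mode='replicate')
        out = torch.empty_like(x)
        for i in range(b):
            # Independently shift each image by up to `pad` pixels in each direction
            top = torch.randint(0, 2 * self.pad + 1, (1,)).item()
            left = torch.randint(0, 2 * self.pad + 1, (1,)).item()
            out[i] = padded[i, :, top:top + h, left:left + w]
        return out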
        D.append(observation, action, reward, done)
        observation = next_observation
        t += 1
    metrics['steps'].append(t * args.action_repeat + (0 if len(metrics['steps']) == 0 else metrics['steps'][-1]))
    metrics['episodes'].append(s)

if args.experience_list:
    from torch.nn import functional as F
    elst = torch.load(args.experience_list)
    done_cnt = 0
    for (obs_, action, reward, done) in elst:
        if done:
            done_cnt += 1
            print(f"Loading {done_cnt}")
        observation_ = torch.from_numpy(obs_.astype(np.float32))
        preprocess_observation_(observation_, args.bit_depth)
        action = torch.tensor(action)
        # print(action, reward, done, obs_.shape)
        # act_ = F.one_hot(idx_, env.action_size).float()
        D.append(observation_, action, reward, done)
        if done_cnt == 3:
            break

# Initialise model parameters randomly
transition_model = TransitionModel(args.belief_size, args.state_size, env.action_size,
                                   args.hidden_size, args.embedding_size,
                                   args.dense_activation_function).to(device)
observation_model = ObservationModel(env.observation_size, args.belief_size, args.state_size,
                                     args.embedding_size, args.cnn_activation_function,