def collect_samples(self, batch_size=1):
    # Collect `num_trajs` parallel trajectories of length `max_traj_length`.
    # Relies on module-level `Memory`, `args`, `device`, and `FloatTensor` definitions.
    memory = Memory()
    num_trajs = (batch_size + args.sample_traj_length - 1) // args.sample_traj_length
    onehot_state, multihot_state, continuous_state = self.reset(num_trajs)

    for walk_step in range(self.max_traj_length - 1):
        with torch.no_grad():
            onehot_action, multihot_action, continuous_action, next_onehot_state, next_multihot_state, next_continuous_state, old_log_prob = self.step(
                onehot_state, multihot_state, continuous_state, num_trajs)

        # Assume the trajectory is not done until it reaches max_traj_length.
        mask = torch.ones((num_trajs, 1), device=device)

        memory.push(onehot_state.type(FloatTensor), multihot_state.type(FloatTensor), continuous_state,
                    onehot_action.type(FloatTensor), multihot_action.type(FloatTensor), continuous_action,
                    next_onehot_state.type(FloatTensor), next_multihot_state.type(FloatTensor),
                    next_continuous_state, old_log_prob, mask)

        onehot_state, multihot_state, continuous_state = next_onehot_state, next_multihot_state, next_continuous_state

    # One more step to push the terminal transition (mask = 0).
    with torch.no_grad():
        onehot_action, multihot_action, continuous_action, next_onehot_state, next_multihot_state, next_continuous_state, old_log_prob = self.step(
            onehot_state, multihot_state, continuous_state, num_trajs)
    mask = torch.zeros((num_trajs, 1), device=device)
    memory.push(onehot_state.type(FloatTensor), multihot_state.type(FloatTensor), continuous_state,
                onehot_action.type(FloatTensor), multihot_action.type(FloatTensor), continuous_action,
                next_onehot_state.type(FloatTensor), next_multihot_state.type(FloatTensor),
                next_continuous_state, old_log_prob, mask)

    return memory, num_trajs
def collect_samples(self, mini_batch_size, size=1):
    # Keep sampling trajectories of `size` parallel walks until at least
    # `mini_batch_size` transitions have been pushed into the memory.
    num_step = 0
    memory = Memory()
    while num_step < mini_batch_size:
        discrete_state, continuous_state = self.reset(size)
        for walk_step in range(self.max_traj_length - 1):
            with torch.no_grad():
                discrete_action, continuous_action, next_discrete_state, next_continuous_state, old_log_prob = self.step(
                    discrete_state, continuous_state, size)

            # Assume the trajectory is not done until it reaches max_traj_length.
            mask = torch.ones((size, 1), device=device)

            memory.push(discrete_state.type(FloatTensor), continuous_state,
                        discrete_action.type(FloatTensor), continuous_action,
                        next_discrete_state.type(FloatTensor), next_continuous_state,
                        old_log_prob, mask)

            discrete_state, continuous_state = next_discrete_state, next_continuous_state
            num_step += 1
            if num_step >= mini_batch_size:
                return memory

        # One more step to push the terminal transition (mask = 0).
        with torch.no_grad():
            discrete_action, continuous_action, next_discrete_state, next_continuous_state, old_log_prob = self.step(
                discrete_state, continuous_state, size)
        mask = torch.zeros((size, 1), device=device)
        memory.push(discrete_state.type(FloatTensor), continuous_state,
                    discrete_action.type(FloatTensor), continuous_action,
                    next_discrete_state.type(FloatTensor), next_continuous_state,
                    old_log_prob, mask)
        num_step += 1
    return memory
def _rollout_with_memory(self, env, network, args, running_state, max_episode_steps, keep_memory=False):
    # Roll out full episodes until at least `args.batch_size` environment steps are collected.
    memory = Memory()
    num_steps = 0
    reward_list = []
    len_list = []
    while num_steps < args.batch_size:
        state = env.reset()
        if args.state_norm:
            state = running_state(state)
        if args.append_time:
            state = np.append(state, 1.0)
        reward_sum = 0
        for t in range(max_episode_steps):
            action_mean, action_std, value = network(Tensor(state).unsqueeze(0))
            action_mean = action_mean[0]
            action_std = action_std[0]
            action, y = network.select_action(action_mean, action_std)
            action_mean = action_mean.data.numpy()
            action = action.data.numpy()
            y = y.data.numpy()

            next_state, reward, done, info = env.step(action)
            reward_sum += reward
            if args.state_norm:
                next_state = running_state(next_state)
            if args.append_time:
                next_state = np.append(next_state, 1 - (t + 1) / max_episode_steps)

            # mask = 0 marks the end of an episode (terminal state or time limit).
            mask = 0 if (done or ((t + 1) == max_episode_steps)) else 1
            memory.push(state, value, action_mean, action, y, mask, next_state, reward)

            if done:
                break
            state = next_state

        num_steps += (t + 1)
        reward_list.append(reward_sum)
        len_list.append(t + 1)

    meanepreward = np.mean(reward_list)
    meaneplen = np.mean(len_list)

    if keep_memory:
        self.memory = memory
        self.old_std = network.action_std.data
        return meanepreward, meaneplen
    else:
        return memory, meanepreward, meaneplen, num_steps
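# All three collectors above push transitions into a `Memory` object whose definition is not
# shown here. The sketch below is a minimal stand-in, assuming `Memory` is a simple list-backed
# transition buffer with `push`/`sample` in the style of the classic PyTorch replay-memory recipe.
# The `Transition` field names (taken from `_rollout_with_memory`'s push order) and the `sample`
# batching are illustrative assumptions, not the original API.
import random
from collections import namedtuple

# Hypothetical transition layout; the other collectors would use their own field sets.
Transition = namedtuple('Transition',
                        ('state', 'value', 'action_mean', 'action', 'y', 'mask', 'next_state', 'reward'))


class Memory(object):
    def __init__(self):
        self.memory = []

    def push(self, *args):
        # Store one transition; callers pass the fields positionally, as in the collectors above.
        self.memory.append(Transition(*args))

    def sample(self):
        # Return all stored transitions as a single Transition of field-wise tuples,
        # ready to be stacked into batched tensors by the training loop.
        return Transition(*zip(*self.memory))

    def __len__(self):
        return len(self.memory)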