Example #1
 def collect_samples(self, batch_size=1):
     memory = Memory()
     # Ceiling division: enough parallel trajectories to cover batch_size steps.
     num_trajs = (batch_size + args.sample_traj_length -
                  1) // args.sample_traj_length
     onehot_state, multihot_state, continuous_state = self.reset(num_trajs)
     for walk_step in range(self.max_traj_length - 1):
         with torch.no_grad():
             onehot_action, multihot_action, continuous_action, next_onehot_state, next_multihot_state, next_continuous_state, old_log_prob = self.step(
                 onehot_state, multihot_state, continuous_state, num_trajs)
         # Assume the rollout never terminates early: every step before max_traj_length gets mask = 1.
         mask = torch.ones((num_trajs, 1), device=device)
         memory.push(onehot_state.type(FloatTensor),
                     multihot_state.type(FloatTensor), continuous_state,
                     onehot_action.type(FloatTensor),
                     multihot_action.type(FloatTensor), continuous_action,
                     next_onehot_state.type(FloatTensor),
                     next_multihot_state.type(FloatTensor),
                     next_continuous_state, old_log_prob, mask)
         onehot_state, multihot_state, continuous_state = next_onehot_state, next_multihot_state, next_continuous_state
     # Take one final step so the terminal transition is pushed with mask = 0.
     with torch.no_grad():
         onehot_action, multihot_action, continuous_action, next_onehot_state, next_multihot_state, next_continuous_state, old_log_prob = self.step(
             onehot_state, multihot_state, continuous_state, num_trajs)
         mask = torch.zeros((num_trajs, 1), device=device)
         memory.push(onehot_state.type(FloatTensor),
                     multihot_state.type(FloatTensor), continuous_state,
                     onehot_action.type(FloatTensor),
                     multihot_action.type(FloatTensor), continuous_action,
                     next_onehot_state.type(FloatTensor),
                     next_multihot_state.type(FloatTensor),
                     next_continuous_state, old_log_prob, mask)
     return memory, num_trajs
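
None of these snippets define the Memory container they fill, nor the module-level device, FloatTensor, and args objects they reference; push() is simply called with a variable number of batched tensors per transition. A minimal stand-in compatible with all three call sites might look like the sketch below (an assumption for illustration, not the source's actual class):

class Memory:
    def __init__(self):
        self.transitions = []

    def push(self, *fields):
        # Append one transition; each field is a batched tensor.
        self.transitions.append(tuple(fields))

    def sample(self):
        # Transpose the stored list so each field comes back as a tuple
        # of per-step values, ready for torch.cat / torch.stack.
        return tuple(zip(*self.transitions))

    def __len__(self):
        return len(self.transitions)
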
Example #2
 def collect_samples(self, mini_batch_size, size=1):
     # size is the number of trajectories advanced in parallel each step.
     num_step = 0
     memory = Memory()
     while num_step < mini_batch_size:
         discrete_state, continuous_state = self.reset(size)
         for walk_step in range(self.max_traj_length - 1):
             with torch.no_grad():
                 discrete_action, continuous_action, next_discrete_state, next_continuous_state, old_log_prob = self.step(
                     discrete_state, continuous_state, size)
             # Assume the rollout never terminates early: every step before max_traj_length gets mask = 1.
             mask = torch.ones((size, 1), device=device)
             memory.push(discrete_state.type(FloatTensor), continuous_state,
                         discrete_action.type(FloatTensor),
                         continuous_action,
                         next_discrete_state.type(FloatTensor),
                         next_continuous_state, old_log_prob, mask)
             discrete_state, continuous_state = next_discrete_state, next_continuous_state
             num_step += 1
             if num_step >= mini_batch_size:
                 return memory
         # Take one final step so the terminal transition is pushed with mask = 0.
         with torch.no_grad():
             discrete_action, continuous_action, next_discrete_state, next_continuous_state, old_log_prob = self.step(
                 discrete_state, continuous_state, size)
             mask = torch.zeros((size, 1), device=device)
             memory.push(discrete_state.type(FloatTensor), continuous_state,
                         discrete_action.type(FloatTensor),
                         continuous_action,
                         next_discrete_state.type(FloatTensor),
                         next_continuous_state, old_log_prob, mask)
             num_step += 1
     return memory
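
Examples #1 and #2 record only a mask column (1 while a trajectory continues, 0 on its final step); no reward is pushed, so rewards are presumably attached downstream. For context, a mask stored this way is typically consumed when discounted returns are computed, as in the generic sketch below (illustrative only, not code from the source):

import torch

def discounted_returns(rewards, masks, gamma=0.99):
    # rewards, masks: tensors of shape (T,), time-ordered; mask = 0 cuts
    # the recursion at a trajectory boundary so returns never bleed from
    # one episode into the one stored before it.
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(rewards.size(0))):
        running = rewards[t] + gamma * masks[t] * running
        returns[t] = running
    return returns
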
Example #3
    def _rollout_with_memory(self,
                             env,
                             network,
                             args,
                             running_state,
                             max_episode_steps,
                             keep_memory=False):
        memory = Memory()
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            if args.append_time:
                state = np.append(state, 1.0)
            reward_sum = 0
            for t in range(max_episode_steps):
                # Roll out without building autograd graphs; the stored
                # value is only used later as a plain target.
                with torch.no_grad():
                    action_mean, action_std, value = network(
                        Tensor(state).unsqueeze(0))
                    action_mean = action_mean[0]
                    action_std = action_std[0]
                    action, y = network.select_action(action_mean, action_std)
                action_mean = action_mean.numpy()
                action = action.numpy()
                y = y.numpy()
                next_state, reward, done, info = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if args.append_time:
                    next_state = np.append(next_state,
                                           1 - (t + 1) / max_episode_steps)
                mask = 0 if (done or ((t + 1) == max_episode_steps)) else 1
                memory.push(state, value, action_mean, action, y, mask,
                            next_state, reward)

                if done:
                    break

                state = next_state

            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)

        # Averages over all episodes collected in this batch.
        meanepreward = np.mean(reward_list)
        meaneplen = np.mean(len_list)

        if keep_memory:
            self.memory = memory
            self.old_std = network.action_std.data
            return meanepreward, meaneplen
        else:
            return memory, meanepreward, meaneplen, num_steps
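
Example #3 additionally stores the critic's value estimate and the environment reward alongside the mask, which is exactly what generalized advantage estimation consumes downstream. A hedged sketch of that step follows (function and parameter names assumed, not taken from the source):

import numpy as np

def estimate_advantages(rewards, masks, values, gamma=0.99, lam=0.95):
    rewards, masks, values = map(np.asarray, (rewards, masks, values))
    advantages = np.zeros_like(rewards, dtype=np.float64)
    gae, next_value = 0.0, 0.0
    for t in reversed(range(len(rewards))):
        # mask = 0 at an episode end, so neither the bootstrap value nor
        # the accumulated GAE term leaks across episode boundaries.
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        advantages[t] = gae
        next_value = values[t]
    return advantages, advantages + values
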