Example #1
    def __init__(self, data_set_path, num_states, num_actions):
        self.expert_data = np.array(pd.read_csv(data_set_path))
        self.state = FLOAT(self.expert_data[:, :num_states])
        self.action = FLOAT(self.expert_data[:, num_states:num_states + num_actions])
        self.next_state = FLOAT(self.expert_data[:, num_states + num_actions:])
        self.length = self.state.size(0)
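Every example on this page relies on a FLOAT helper and a module-level device that are not shown. Judging by how FLOAT is applied both to numpy arrays and to torch.Size objects (see Example #8), it is most likely a plain alias for the torch.FloatTensor constructor; the definitions below are an assumption rather than the project's actual utilities.

import torch

# Assumed helper definitions (not shown in the examples themselves).
FLOAT = torch.FloatTensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")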
Example #2
    def step(self, action):
        # predict the next state with the learned dynamics model
        with torch.no_grad():
            self.state = self.model.get_next_state(
                FLOAT(self.state).to(device).unsqueeze(0),
                FLOAT(action).to(device).unsqueeze(0)
            ).cpu().numpy()[0]
        self.cur_step += 1
        done = (self.cur_step >= self.max_step)
        reward = self._calc_reward()
        return self.state, reward, done, {}
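A hypothetical driver loop for an environment exposing the step() above; env, agent and a gym-style reset() returning the initial state are placeholders, not part of the original code.

# Placeholder rollout loop; `env` and `agent` stand in for objects built
# from the classes shown on this page.
state = env.reset()
episode_reward, done = 0.0, False
while not done:
    action = agent.choose_action(state, noise_scale=0.1)  # cf. Example #6
    state, reward, done, _ = env.step(action)
    episode_reward += reward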
Example #3
    def update(self, batch):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by DDPG
        ddpg_step(self.policy_net, self.policy_net_target, self.value_net,
                  self.value_net_target, self.optimizer_p, self.optimizer_v,
                  batch_state, batch_action, batch_reward, batch_next_state,
                  batch_mask, self.gamma, self.polyak)
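The ddpg_step call above hides the actual optimisation. Since that function is not part of the snippet, the sketch below shows only the standard DDPG update it presumably performs; the name ddpg_step_sketch, the value_net(state, action) call signature and the polyak convention (target = polyak * target + (1 - polyak) * online) are assumptions.

import torch
import torch.nn.functional as F

# Sketch of a ddpg_step-style update; an illustration of the standard
# DDPG algorithm, not the project's exact code.
def ddpg_step_sketch(policy_net, policy_net_target, value_net, value_net_target,
                     optimizer_p, optimizer_v,
                     states, actions, rewards, next_states, masks,
                     gamma, polyak):
    # critic target: r + gamma * mask * Q_target(s', pi_target(s'))
    with torch.no_grad():
        next_actions = policy_net_target(next_states)
        target_q = rewards + gamma * masks * value_net_target(next_states, next_actions)

    # critic update: MSE towards the bootstrapped target
    value_loss = F.mse_loss(value_net(states, actions), target_q)
    optimizer_v.zero_grad()
    value_loss.backward()
    optimizer_v.step()

    # actor update: maximise Q(s, pi(s))
    policy_loss = -value_net(states, policy_net(states)).mean()
    optimizer_p.zero_grad()
    policy_loss.backward()
    optimizer_p.step()

    # polyak (soft) update of both target networks
    with torch.no_grad():
        for target, source in ((policy_net_target, policy_net),
                               (value_net_target, value_net)):
            for t_param, param in zip(target.parameters(), source.parameters()):
                t_param.mul_(polyak).add_((1.0 - polyak) * param)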
Example #4
    def update(self, batch, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC Alpha
        sac_alpha_step(self.policy_net, self.q_net_1, self.q_net_2, self.alpha,
                       self.q_net_target_1, self.q_net_target_2,
                       self.optimizer_p, self.optimizer_q_1, self.optimizer_q_2, self.optimizer_a,
                       batch_state, batch_action, batch_reward, batch_next_state, batch_mask,
                       self.gamma, self.polyak, self.target_entropy,
                       k_iter % self.target_update_delay == 0)
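sac_alpha_step itself is also not shown. The piece that distinguishes it from the DDPG update is the learned temperature, so here is a minimal sketch of only the usual temperature loss, assuming self.alpha is a learnable scalar tensor registered with self.optimizer_a and that get_action_log_prob returns (action, log_prob) as in Example #5 below.

import torch

# Sketch of the temperature part of a SAC update only; a guess at the
# role of self.alpha, self.optimizer_a and self.target_entropy, not the
# project's sac_alpha_step. `alpha` is assumed to be a learnable scalar
# tensor (many implementations optimise log-alpha instead to keep it positive).
def alpha_update_sketch(policy_net, alpha, optimizer_a, states, target_entropy):
    with torch.no_grad():
        _, log_prob = policy_net.get_action_log_prob(states)
    # push alpha so that the policy entropy tracks target_entropy
    alpha_loss = -(alpha * (log_prob + target_entropy)).mean()
    optimizer_a.zero_grad()
    alpha_loss.backward()
    optimizer_a.step()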
Example #5
    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        return action, None
Example #6
    def choose_action(self, state, noise_scale):
        """select action"""
        self.policy_net.eval()
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action = self.policy_net(state)
        self.policy_net.train()
        action = action.cpu().numpy()[0]
        # add noise
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, -self.action_high, self.action_high)
        return action
Example #7
class ExpertDataSet(Dataset):
    def __init__(self, data_set_path, num_states, num_actions):
        # csv column layout: [state columns | action columns | next-state columns]
        self.expert_data = np.array(pd.read_csv(data_set_path))
        self.state = FLOAT(self.expert_data[:, :num_states])
        self.action = FLOAT(self.expert_data[:, num_states:num_states + num_actions])
        self.next_state = FLOAT(self.expert_data[:, num_states + num_actions:])
        self.length = self.state.size(0)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # only (state, action) pairs are served; next_state is kept but unused here
        return self.state[idx], self.action[idx]
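A hypothetical way to consume ExpertDataSet, e.g. for behavioural-cloning-style training; the file name, dimensions and batch size below are placeholders.

from torch.utils.data import DataLoader

# Placeholder path and dimensions; the csv must follow the
# [state | action | next state] column layout expected by ExpertDataSet.
dataset = ExpertDataSet("expert_trajectories.csv", num_states=11, num_actions=3)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

for states, actions in loader:
    # e.g. minimise a supervised loss between policy(states) and actions
    pass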
Example #8
def estimate_advantages(rewards, masks, values, gamma, tau, trajectory_length):
    """
    Generalized Advantage Estimation (GAE)
    :param rewards: [trajectory length * parallel size, 1]
    :param masks: [trajectory length * parallel size, 1]
    :param values: [trajectory length * parallel size, 1]
    :param gamma: discount factor
    :param tau: GAE parameter (often written as lambda)
    :param trajectory_length: the length of each trajectory
    :return: advantages and returns, both [trajectory length * parallel size, 1]
    """
    trans_shape_func = lambda x: x.reshape(trajectory_length, -1, 1)
    rewards = trans_shape_func(rewards)  # [trajectory length, parallel size, 1]
    masks = trans_shape_func(masks)      # [trajectory length, parallel size, 1]
    values = trans_shape_func(values)    # [trajectory length, parallel size, 1]

    # preallocate buffers of the same shape (values are uninitialised)
    deltas = FLOAT(rewards.size()).to(device)
    advantages = FLOAT(rewards.size()).to(device)

    # calculate advantages in parallel
    prev_value = torch.zeros((rewards.size(1), 1), device=device)
    prev_advantage = torch.zeros((rewards.size(1), 1), device=device)

    for i in reversed(range(rewards.size(0))):
        deltas[i, ...] = rewards[i, ...] + gamma * prev_value * masks[i, ...] - values[i, ...]
        advantages[i, ...] = deltas[i, ...] + gamma * tau * prev_advantage * masks[i, ...]

        prev_value = values[i, ...]
        prev_advantage = advantages[i, ...]

    returns = values + advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)

    # flatten back to [trajectory length * parallel size, 1] for the PPO update
    return advantages.reshape(-1, 1), returns.reshape(-1, 1)
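A hypothetical call with random tensors standing in for collected rollouts (in practice values would come from a critic and masks would be 0 at episode boundaries); it assumes the FLOAT and device helpers used inside estimate_advantages are in scope.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
T, P = 100, 8                                  # trajectory length, parallel envs (placeholders)
rewards = torch.rand(T * P, 1, device=device)
masks = torch.ones(T * P, 1, device=device)    # 0 where an episode terminated
values = torch.rand(T * P, 1, device=device)

advantages, returns = estimate_advantages(rewards, masks, values,
                                          gamma=0.99, tau=0.95,
                                          trajectory_length=T)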