Example #1
    def random_iterator(self, batch_size, train_val_split_ratio=0.2):
        history_states = np.array(self.history_states)
        history_actions = np.array(self.history_actions)
        states = np.array(self.states)
        actions = np.array(self.actions)

        input_tuple = (history_states, history_actions, states, actions)

        output_tuple = train_test_split(*input_tuple,
                                        test_size=train_val_split_ratio)

        train_tuple = output_tuple[0::2]
        val_tuple = output_tuple[1::2]

        # In training, drop the last batch to avoid a size-1 batch, which may crash a batch_norm layer.
        train_data_loader = create_data_loader(train_tuple,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               drop_last=True)
        val_data_loader = create_data_loader(val_tuple,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             drop_last=False)

        return train_data_loader, val_data_loader
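Note: scikit-learn's train_test_split returns its splits interleaved as (train_0, val_0, train_1, val_1, ...), which is why output_tuple[0::2] and output_tuple[1::2] recover the train and validation tuples above. The create_data_loader helper used throughout these examples is project-specific and not shown on this page; the sketch below is only a plausible minimal version, assuming it wraps a tuple of numpy arrays in a PyTorch TensorDataset.

# A minimal sketch of a plausible create_data_loader, assuming it only wraps a
# tuple of numpy arrays (or tensors) into a DataLoader; the helper actually
# used by these examples may differ in details such as dtype handling.
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset


def create_data_loader(data, batch_size=32, shuffle=False, drop_last=False):
    tensors = tuple(torch.as_tensor(np.asarray(d)) for d in data)
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      drop_last=drop_last)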
Example #2
    def random_iterator(self,
                        batch_size,
                        train_val_split_ratio=0.2,
                        window_length=None):
        """ Create iterator over (s, a, s', r, d)

        Args:
            batch_size: batch size

        Returns:

        """

        input_tuple = self._create_state_action_next_state(
            window_length=window_length)
        output_tuple = train_test_split(*input_tuple,
                                        test_size=train_val_split_ratio)
        train_tuple = output_tuple[0::2]
        val_tuple = output_tuple[1::2]
        train_data_loader = create_data_loader(train_tuple,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               drop_last=False)
        val_data_loader = create_data_loader(val_tuple,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             drop_last=False)
        return train_data_loader, val_data_loader
Example #3
    def random_iterator(self, batch_size, train_val_split_ratio=0.2):
        """Create an iterator for the whole transitions

        Returns:

        """
        input_tuple = (self._obs_storage, self._action_storage,
                       self._next_obs_storage, self._reward_storage,
                       self._done_storage)

        output_tuple = train_test_split(*input_tuple,
                                        test_size=train_val_split_ratio)

        train_tuple = output_tuple[0::2]
        val_tuple = output_tuple[1::2]

        train_data_loader = create_data_loader(train_tuple,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               drop_last=False)
        val_data_loader = create_data_loader(val_tuple,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             drop_last=False)

        return train_data_loader, val_data_loader
Example #4
    def predict(self, x, batch_size, verbose=False):
        self.model.eval()
        if not isinstance(x, tuple):
            x = (x,)

        data_loader = create_data_loader(x, batch_size=batch_size, shuffle=False, drop_last=False)
        if verbose:
            data_loader = tqdm(data_loader, desc='Predicting')
        outputs = []
        with torch.no_grad():
            for data in data_loader:
                current_outputs = self.model.forward(*data)
                if not isinstance(current_outputs, tuple):
                    current_outputs = [current_outputs]

                if len(outputs) == 0:
                    for current_output in current_outputs:
                        outputs.append([current_output])
                else:
                    for i, current_output in enumerate(current_outputs):
                        outputs[i].append(current_output)

            for i, output in enumerate(outputs):
                outputs[i] = torch.cat(output, dim=0).cpu().numpy()

        return outputs
Example #5
    def random_iterator(self, batch_size, train_val_split_ratio=0.2):
        states = []
        actions = []
        next_states = []
        rewards = []
        dones = []
        for trajectory in self.memory:
            for i in range(self.window_length, trajectory.state.shape[0]):
                states.append(trajectory.state[i - self.window_length:i])
                next_states.append(trajectory.state[i])
                actions.append(trajectory.action[i - self.window_length:i])
            rewards.append(trajectory.reward[self.window_length - 1:])
            done = [False] * (trajectory.action.shape[0] -
                              self.window_length + 1)
            done[-1] = True
            dones.append(np.array(done))

        states = np.stack(states, axis=0)
        actions = np.stack(actions, axis=0)
        next_states = np.stack(next_states, axis=0)
        rewards = np.concatenate(rewards, axis=0)
        dones = np.concatenate(dones, axis=0)

        input_tuple = (states, actions, next_states, rewards, dones)

        output_tuple = train_test_split(*input_tuple,
                                        test_size=train_val_split_ratio)

        train_tuple = output_tuple[0::2]
        val_tuple = output_tuple[1::2]

        train_data_loader = create_data_loader(train_tuple,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               drop_last=False)
        val_data_loader = create_data_loader(val_tuple,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             drop_last=False)

        return train_data_loader, val_data_loader
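To make the window slicing in this example concrete, below is a small, self-contained illustration with made-up toy shapes: each sample pairs a history window of states and actions with the single state that follows the window.

# Standalone illustration of the window slicing used above, with toy shapes
# chosen only for demonstration.
import numpy as np

window_length = 3
state = np.arange(6 * 2).reshape(6, 2)    # toy trajectory: 6 states, ob_dim=2
action = np.arange(6 * 1).reshape(6, 1)   # matching actions, ac_dim=1

states, actions, next_states = [], [], []
for i in range(window_length, state.shape[0]):
    states.append(state[i - window_length:i])    # history window of states
    actions.append(action[i - window_length:i])  # history window of actions
    next_states.append(state[i])                 # the state to predict

print(np.stack(states).shape)       # (3, 3, 2): (num_samples, window_length, ob_dim)
print(np.stack(actions).shape)      # (3, 3, 1)
print(np.stack(next_states).shape)  # (3, 2)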
Example #6
    def random_iterator(self, batch_size, train_val_split_ratio=0.2):
        states = []
        actions = []
        rewards = []
        next_states = []
        dones = []
        for trajectory in self.memory:
            states.append(trajectory.state[:-1])
            actions.append(trajectory.action)
            next_states.append(trajectory.state[1:])
            rewards.append(trajectory.reward)
            done = [False] * trajectory.action.shape[0]
            done[-1] = True
            dones.append(np.array(done))

        states = np.concatenate(states, axis=0)
        actions = np.concatenate(actions, axis=0)
        next_states = np.concatenate(next_states, axis=0)
        rewards = np.concatenate(rewards, axis=0)
        dones = np.concatenate(dones, axis=0)

        input_tuple = (states, actions, next_states, rewards, dones)

        output_tuple = train_test_split(*input_tuple,
                                        test_size=train_val_split_ratio)

        train_tuple = output_tuple[0::2]
        val_tuple = output_tuple[1::2]

        # In training, drop the last batch to avoid a size-1 batch, which may crash a batch_norm layer.
        train_data_loader = create_data_loader(train_tuple,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               drop_last=True)
        val_data_loader = create_data_loader(val_tuple,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             drop_last=False)

        return train_data_loader, val_data_loader
Example #7
    def predict_log_prob_batch(self, state, action):
        data_loader = create_data_loader((state, action),
                                         batch_size=32,
                                         shuffle=False,
                                         drop_last=False)
        log_probs = []
        for obs, action in data_loader:
            obs = move_tensor_to_gpu(obs)
            action = move_tensor_to_gpu(action)
            action_distribution = self.policy_net.forward_action(obs)
            log_probs.append(action_distribution.log_prob(action))
        log_probs = torch.cat(log_probs, dim=0).cpu().numpy()
        return log_probs
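Several of these examples also call move_tensor_to_gpu, which is not shown on this page. Below is a minimal sketch of a plausible version, assuming it only needs to move a tensor, or a tuple/list of tensors, onto the GPU when one is available.

# A minimal sketch of a plausible move_tensor_to_gpu helper; the version used
# by these examples may handle additional container types or devices.
import torch


def move_tensor_to_gpu(data):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if isinstance(data, (tuple, list)):
        return type(data)(move_tensor_to_gpu(t) for t in data)
    return data.to(device)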
Example #8
    def compute_old_log_prob(self, observation, hidden, actions):
        with torch.no_grad():
            data_loader = create_data_loader((observation, hidden, actions),
                                             batch_size=32,
                                             shuffle=False,
                                             drop_last=False)
            old_log_prob = []
            for obs, hid, ac in data_loader:
                obs = move_tensor_to_gpu(obs)
                hid = move_tensor_to_gpu(hid)
                ac = move_tensor_to_gpu(ac)
                old_distribution, _, _ = self.policy_net.forward(obs, hid)
                old_log_prob.append(old_distribution.log_prob(ac))

            old_log_prob = torch.cat(old_log_prob, dim=0).cpu()
        return old_log_prob
Example #9
    def predict_state_value_batch(self, state):
        """ compute the state value using nn baseline

        Args:
            state: (batch_size, ob_dim)

        Returns: (batch_size,)

        """
        data_loader = create_data_loader((state, ),
                                         batch_size=32,
                                         shuffle=False,
                                         drop_last=False)
        values = []
        for obs in data_loader:
            obs = move_tensor_to_gpu(obs[0])
            values.append(self.policy_net.forward_value(obs))
        values = torch.cat(values, dim=0).cpu().numpy()
        return values
Example #10
    def random_iterator(self, batch_size):
        """Create an iterator of all the dataset and update value mean and std


        Args:
            batch_size:

        Returns:

        """
        states = np.concatenate(
            [trajectory.state for trajectory in self.memory], axis=0)
        actions = np.concatenate(
            [trajectory.action for trajectory in self.memory], axis=0)
        reward_to_go = np.concatenate(
            [trajectory.reward_to_go for trajectory in self.memory], axis=0)
        gaes = np.concatenate(
            [trajectory.advantage for trajectory in self.memory], axis=0)
        old_log_prob = np.concatenate(
            [trajectory.old_log_prob for trajectory in self.memory], axis=0)

        value_mean, value_std = np.mean(reward_to_go), np.std(reward_to_go)
        reward_to_go = normalize(reward_to_go, value_mean, value_std)

        self.running_value_mean = self.running_value_mean * self.alpha + value_mean * (
            1 - self.alpha)
        self.running_value_std = self.running_value_std * self.alpha + value_std * (
            1 - self.alpha)

        gaes = normalize(gaes, np.mean(gaes), np.std(gaes))

        batch_size = min(batch_size, states.shape[0])

        data_loader = create_data_loader(
            (states, actions, reward_to_go, gaes, old_log_prob),
            batch_size=batch_size,
            shuffle=True,
            drop_last=True)

        return data_loader
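The normalize helper used above is not defined on this page; the sketch below assumes the usual epsilon-guarded standardization. Note that running_value_mean and running_value_std are updated as an exponential moving average with smoothing factor alpha.

# A minimal sketch of the normalize helper assumed by the example above:
# plain (x - mean) / std scaling with a small epsilon for numerical stability.
eps = 1e-8


def normalize(x, mean, std):
    return (x - mean) / (std + eps)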
Example #11
    def forward(self, input):
        mean = self.model.forward(input)
        dis = torch.distributions.Normal(mean, torch.exp(self.logstd))
        # rsample() without an extra sample shape draws one reparameterized
        # sample per input, so the output shape matches mean
        return dis.rsample()


if __name__ == '__main__':
    x_mean = [0., 1.5]
    x_std = [1, 0]
    y_mean = [-1.5, -0.2]
    y_std = [0.5, 0.1]
    x_train, y_train, x_val, y_val = generate_training_data(
        x_mean, x_std, y_mean, y_std)

    print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

    model = Policy(2, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    regressor = Regressor(model, optimizer, criterion, scheduler=None)

    train_loader = create_data_loader((x_train, y_train))
    val_loader = create_data_loader((x_val, y_val))

    regressor.train(epoch=100,
                    train_data_loader=train_loader,
                    val_data_loader=val_loader,
                    checkpoint_path=None)
Example #12
def compute_reward_to_go_gae(paths, gamma, policy_net, lam, value_mean,
                             value_std):
    rewards = []
    gaes = []
    for path in paths:
        # compute last state value
        if path['mask'][-1] == 1:
            with torch.no_grad():
                last_obs = convert_numpy_to_tensor(
                    np.expand_dims(path['last_obs'], axis=0)).type(FloatTensor)
                last_hidden = convert_numpy_to_tensor(
                    np.expand_dims(path['last_hidden'],
                                   axis=0)).type(FloatTensor)
                last_state_value = policy_net.forward(
                    last_obs, last_hidden)[-1].cpu().numpy()[0]
                last_state_value = last_state_value * value_std + value_mean
        else:
            last_state_value = 0.

        # We need to clip last_state_value to (max_abs_value / (1 - gamma)).
        # Otherwise, a large state value would create a positive feedback loop
        # and cause the reward to explode.
        max_abs_value = np.max(np.abs(path['reward']))
        last_state_value = np.clip(last_state_value,
                                   a_min=-max_abs_value / (1 - gamma),
                                   a_max=max_abs_value / (1 - gamma))

        # calculate reward-to-go
        path['reward'].append(last_state_value)
        current_rewards = discount(path['reward'], gamma).astype(np.float32)

        rewards.append(current_rewards[:-1])

        # compute gae
        with torch.no_grad():
            observation = path['observation']
            hidden = path['hidden']
            data_loader = create_data_loader((observation, hidden),
                                             batch_size=32,
                                             shuffle=False,
                                             drop_last=False)
            values = []
            for obs, hid in data_loader:
                obs = move_tensor_to_gpu(obs)
                hid = move_tensor_to_gpu(hid)
                values.append(policy_net.forward(obs, hid)[-1])
            values = torch.cat(values, dim=0).cpu().numpy()
            values = values * value_std + value_mean
            values = np.append(values, last_state_value)

        # the appended last-obs value bootstraps truncated trajectories
        temporal_difference = (path['reward'][:-1] + values[1:] * gamma -
                               values[:-1])
        # compute GAE as the discounted sum of temporal differences
        gae = discount(temporal_difference, gamma * lam).astype(np.float32)
        gaes.append(gae)

    rewards = np.concatenate(rewards)
    new_values_mean, new_values_std = np.mean(rewards), np.std(rewards)
    rewards = (rewards - new_values_mean) / (new_values_std + eps)

    gaes = np.concatenate(gaes)
    gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + eps)

    return rewards, gaes, new_values_mean, new_values_std
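The discount helper used above is not shown either; it presumably computes the discounted cumulative sum y[t] = sum_{k >= t} gamma^(k - t) * x[k] that both the reward-to-go and GAE calculations need. A minimal sketch using the common scipy.signal.lfilter trick:

# A minimal sketch of a discount helper computing the discounted cumulative
# sum of a 1-D sequence; this is only one common way to implement it.
import numpy as np
from scipy.signal import lfilter


def discount(x, gamma):
    x = np.asarray(x, dtype=np.float64)
    # Filtering the reversed sequence with denominator [1, -gamma] produces the
    # discounted suffix sums; reversing again restores the original order.
    return lfilter([1], [1, -gamma], x[::-1])[::-1]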
Example #13
    def update_policy(self, dataset, epoch=4):
        # construct a dataset using paths containing (action, observation, old_log_prob)
        if self.recurrent:
            data_loader = create_data_loader(dataset,
                                             batch_size=128,
                                             shuffle=False,
                                             drop_last=False)
        else:
            data_loader = create_data_loader(dataset,
                                             batch_size=128,
                                             shuffle=True,
                                             drop_last=False)

        for epoch_index in range(epoch):
            current_hidden = torch.tensor(
                np.expand_dims(self.init_hidden_unit, axis=0),
                requires_grad=False).type(FloatTensor)
            for batch_sample in data_loader:
                action, advantage, observation, discount_rewards, old_log_prob, mask = \
                    move_tensor_to_gpu(batch_sample)

                self.policy_optimizer.zero_grad()
                # update policy
                if not self.recurrent:
                    distribution, _, raw_baselines = self.policy_net.forward(
                        observation, None)
                    entropy_loss = distribution.entropy().mean()
                    log_prob = distribution.log_prob(action)
                else:
                    entropy_loss = []
                    log_prob = []
                    raw_baselines = []
                    zero_index = np.where(mask == 0)[0] + 1
                    zero_index = zero_index.tolist()
                    zero_index.insert(0, 0)

                    for i in range(len(zero_index) - 1):
                        start_index = zero_index[i]
                        end_index = zero_index[i + 1]
                        current_obs = observation[start_index:end_index]
                        current_actions = action[start_index:end_index]
                        current_dist, _, current_baseline = self.policy_net.forward(
                            current_obs, current_hidden)
                        current_hidden = torch.tensor(
                            np.expand_dims(self.init_hidden_unit, axis=0),
                            requires_grad=False).type(FloatTensor)
                        current_log_prob = current_dist.log_prob(
                            current_actions)

                        log_prob.append(current_log_prob)
                        raw_baselines.append(current_baseline)
                        entropy_loss.append(current_dist.entropy())

                    # last iteration
                    start_index = zero_index[-1]
                    if start_index < observation.shape[0]:
                        current_obs = observation[start_index:]
                        current_actions = action[start_index:]
                        current_dist, current_hidden, current_baseline = self.policy_net.forward(
                            current_obs, current_hidden)

                        current_log_prob = current_dist.log_prob(
                            current_actions)

                        log_prob.append(current_log_prob)
                        raw_baselines.append(current_baseline)
                        entropy_loss.append(current_dist.entropy())
                        current_hidden = current_hidden.detach()

                    log_prob = torch.cat(log_prob, dim=0)
                    raw_baselines = torch.cat(raw_baselines, dim=0)
                    entropy_loss = torch.cat(entropy_loss, dim=0).mean()

                assert log_prob.shape == advantage.shape, 'log_prob length {}, advantage length {}'.format(
                    log_prob.shape, advantage.shape)

                # If the approximate KL divergence exceeds 1.5 * target_kl, skip the update for this batch (early stopping)
                negative_approx_kl = log_prob - old_log_prob

                negative_approx_kl_mean = torch.mean(
                    -negative_approx_kl).item()

                if negative_approx_kl_mean > 1.5 * self.target_kl:
                    # print('Early stopping this iteration. Current kl {:.4f}. Current epoch index {}'.format(
                    #     negative_approx_kl_mean, epoch_index))
                    continue

                ratio = torch.exp(negative_approx_kl)
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * advantage
                policy_loss = -torch.min(surr1, surr2).mean()

                value_loss = self.get_baseline_loss(raw_baselines,
                                                    discount_rewards)

                loss = policy_loss - entropy_loss * self.entropy_coef + self.value_coef * value_loss

                loss.backward()
                # clip gradients after backward so the freshly computed
                # gradients are the ones being clipped
                nn.utils.clip_grad_norm_(self.policy_net.parameters(),
                                         self.max_grad_norm)
                self.policy_optimizer.step()