Example #1
    def predict_normalized_delta_next_state_reward(self, states, actions):
        states_normalized = normalize(states, self.state_mean, self.state_std)
        if not self.dynamics_model.discrete:
            actions = normalize(actions, self.action_mean, self.action_std)
        predicted_delta_state_normalized, predicted_reward_normalized = self.dynamics_model.forward(
            states_normalized, actions)
        return predicted_delta_state_normalized, predicted_reward_normalized
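
All of the snippets on this page call `normalize` / `unnormalize` helpers that are not shown here. A minimal sketch of what they are assumed to do (plain z-score scaling with an epsilon guard; the exact signatures in the original code may differ):

def normalize(x, mean, std, eps=1e-8):
    # Z-score normalization; eps guards against a zero standard deviation.
    return (x - mean) / (std + eps)


def unnormalize(x, mean, std):
    # Inverse of normalize.
    return x * std + mean

Both operate elementwise, so they work on NumPy arrays and torch tensors alike.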
Example #2
    def fit_dynamic_model(self,
                          dataset: Dataset,
                          epoch=10,
                          batch_size=128,
                          verbose=False):
        t = range(epoch)
        if verbose:
            t = tqdm(t)

        train_data_loader, val_data_loader = dataset.random_iterator(
            batch_size=batch_size)

        for i in t:
            losses = []
            for states, actions, next_states, _, _ in train_data_loader:
                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)
                delta_states = next_states - states
                # calculate loss
                self.optimizer.zero_grad()
                predicted_delta_state_normalized = self.predict_normalized_delta_next_state(
                    states, actions)
                delta_states_normalized = normalize(delta_states,
                                                    self.delta_state_mean,
                                                    self.delta_state_std)
                loss = F.mse_loss(predicted_delta_state_normalized,
                                  delta_states_normalized)
                loss.backward()
                self.optimizer.step()
                losses.append(loss.item())

            self.eval()
            val_losses = []
            with torch.no_grad():
                for states, actions, next_states, _, _ in val_data_loader:
                    # convert to tensor
                    states = move_tensor_to_gpu(states)
                    actions = move_tensor_to_gpu(actions)
                    next_states = move_tensor_to_gpu(next_states)
                    delta_states = next_states - states
                    predicted_delta_state_normalized = self.predict_normalized_delta_next_state(
                        states, actions)
                    delta_states_normalized = normalize(
                        delta_states, self.delta_state_mean,
                        self.delta_state_std)
                    loss = F.mse_loss(predicted_delta_state_normalized,
                                      delta_states_normalized)
                    val_losses.append(loss.item())
            self.train()

            if verbose:
                t.set_description(
                    'Epoch {}/{} - Avg model train loss: {:.4f} - Avg model val loss: {:.4f}'
                    .format(i + 1, epoch, np.mean(losses),
                            np.mean(val_losses)))
Example #3
    def predict_next_states(self, states, actions):
        assert self.state_mean is not None, 'Please set statistics before training for inference.'
        states_normalized = normalize(states, self.state_mean, self.state_std)

        if not self.dynamics_model.discrete:
            actions = normalize(actions, self.action_mean, self.action_std)

        predicted_delta_state_normalized = self.dynamics_model.forward(
            states_normalized, actions)
        predicted_delta_state = unnormalize(predicted_delta_state_normalized,
                                            self.delta_state_mean,
                                            self.delta_state_std)
        return states + predicted_delta_state
Example #4
    def predict_next_states(self, states, actions, z=None):
        assert self.state_mean is not None, 'Please set statistics before training for inference.'
        states = normalize(states, self.state_mean, self.state_std)

        if not self.dynamics_model.discrete:
            actions = normalize(actions, self.action_mean, self.action_std)

        if z is None:
            z = self._sample_latent_code(states.shape[0])

        predicted_states_normalized = self.dynamics_model.forward(
            states, actions, z)
        predicted_states = unnormalize(predicted_states_normalized,
                                       self.state_mean, self.state_std)
        return predicted_states
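
Example #4 additionally calls a private `self._sample_latent_code(batch_size)` that is not shown. A purely hypothetical sketch, assuming the latent code is drawn from a standard normal prior and that the class stores its dimensionality in an attribute named `code_dim` (both are assumptions, not taken from the source):

    def _sample_latent_code(self, batch_size):
        # Hypothetical: one latent code per sample from a standard normal prior.
        # self.code_dim is an assumed attribute holding the latent dimensionality.
        return torch.randn(batch_size, self.code_dim)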
Example #5
    def random_iterator(self, batch_size):
        """Create an iterator of all the dataset and update value mean and std


        Args:
            batch_size:

        Returns:

        """
        states = np.concatenate(
            [trajectory.state for trajectory in self.memory], axis=0)
        actions = np.concatenate(
            [trajectory.action for trajectory in self.memory], axis=0)
        reward_to_go = np.concatenate(
            [trajectory.reward_to_go for trajectory in self.memory], axis=0)
        gaes = np.concatenate(
            [trajectory.advantage for trajectory in self.memory], axis=0)
        old_log_prob = np.concatenate(
            [trajectory.old_log_prob for trajectory in self.memory], axis=0)

        value_mean, value_std = np.mean(reward_to_go), np.std(reward_to_go)
        reward_to_go = normalize(reward_to_go, value_mean, value_std)

        self.running_value_mean = self.running_value_mean * self.alpha + value_mean * (
            1 - self.alpha)
        self.running_value_std = self.running_value_std * self.alpha + value_std * (
            1 - self.alpha)

        gaes = normalize(gaes, np.mean(gaes), np.std(gaes))

        batch_size = min(batch_size, states.shape[0])

        data_loader = create_data_loader(
            (states, actions, reward_to_go, gaes, old_log_prob),
            batch_size=batch_size,
            shuffle=True,
            drop_last=True)

        return data_loader
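
`random_iterator` delegates to a `create_data_loader` helper that this page does not show either. A minimal sketch, assuming it wraps the NumPy arrays in a `TensorDataset` and returns an (optionally shuffled) `DataLoader`:

import torch
from torch.utils.data import DataLoader, TensorDataset


def create_data_loader(arrays, batch_size, shuffle=True, drop_last=False):
    # Wrap a tuple of equally sized arrays into a TensorDataset and iterate over it in batches.
    tensors = [torch.as_tensor(a, dtype=torch.float32) for a in arrays]
    return DataLoader(TensorDataset(*tensors), batch_size=batch_size,
                      shuffle=shuffle, drop_last=drop_last)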
Example #6
    def predict_next_states(self, states, actions):
        """

        Args:
            states: (batch_size, window_length, 6)
            actions: (batch_size, window_length, 4)

        Returns: next obs of shape (batch_size, 6)

        """
        assert self.state_mean is not None, 'Please set statistics before training for inference.'
        states_normalized = normalize(states, self.state_mean, self.state_std)

        if not self.dynamics_model.discrete:
            actions = normalize(actions, self.action_mean, self.action_std)

        predicted_delta_state_normalized = self.dynamics_model.forward(
            states_normalized, actions)
        predicted_delta_state = unnormalize(predicted_delta_state_normalized,
                                            self.delta_state_mean,
                                            self.delta_state_std)
        return states[:, -1, :] + predicted_delta_state
Example #7
    def fit_dynamic_model(self,
                          dataset,
                          epoch=10,
                          batch_size=128,
                          logger=None):
        t = tqdm(range(epoch))

        train_data_loader, val_data_loader = dataset.random_iterator(
            batch_size=batch_size)

        for i in t:
            losses = []
            for states, actions, next_states, rewards, _ in train_data_loader:

                # skip size-1 batches: they can crash a batch_norm layer in training mode.
                if states.shape[0] == 1:
                    continue

                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)
                rewards = move_tensor_to_gpu(rewards)
                delta_states = next_states - states
                # calculate loss
                self.optimizer.zero_grad()
                predicted_delta_state_normalized, predicted_reward_normalized = \
                    self.predict_normalized_delta_next_state_reward(states, actions)
                delta_states_normalized = normalize(delta_states,
                                                    self.delta_state_mean,
                                                    self.delta_state_std)
                loss = F.mse_loss(predicted_delta_state_normalized,
                                  delta_states_normalized)
                if self.cost_fn_batch is None:
                    rewards_normalized = normalize(rewards, self.reward_mean,
                                                   self.reward_std)
                    loss += F.mse_loss(predicted_reward_normalized,
                                       rewards_normalized)
                loss.backward()
                self.optimizer.step()
                losses.append(loss.item())

            self.eval()
            val_losses = []
            with torch.no_grad():
                for states, actions, next_states, rewards, _ in val_data_loader:
                    # convert to tensor
                    states = move_tensor_to_gpu(states)
                    actions = move_tensor_to_gpu(actions)
                    next_states = move_tensor_to_gpu(next_states)
                    rewards = move_tensor_to_gpu(rewards)
                    delta_states = next_states - states
                    predicted_delta_state_normalized, predicted_reward_normalized = \
                        self.predict_normalized_delta_next_state_reward(states, actions)
                    delta_states_normalized = normalize(
                        delta_states, self.delta_state_mean,
                        self.delta_state_std)
                    loss = F.mse_loss(predicted_delta_state_normalized,
                                      delta_states_normalized)
                    if self.cost_fn_batch is None:
                        rewards_normalized = normalize(rewards,
                                                       self.reward_mean,
                                                       self.reward_std)
                        loss += F.mse_loss(predicted_reward_normalized,
                                           rewards_normalized)
                    val_losses.append(loss.item())
            self.train()

            if logger:
                logger.store(ModelTrainLoss=np.mean(losses))
                logger.store(ModelValLoss=np.mean(val_losses))

            t.set_description(
                'Epoch {}/{} - Avg model train loss: {:.4f} - Avg model val loss: {:.4f}'
                .format(i + 1, epoch, np.mean(losses), np.mean(val_losses)))