Example #1
    def step_update(self, state, action, reward, next_state, done):
        self.step_counter += 1
        self.discounted_sum_reward += self.current_disc * reward
        self.current_disc *= self.discount_factor

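        # Update the context distribution only after the warm-up offset and only every `update_frequency` steps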
        if self.step_counter >= self.update_offset and self.step_counter % self.update_frequency == 0:
            if len(self.discounted_sum_rewards) > 0 and len(self.context_buffer) > 0:
                self.algorithm_iterations += 1
                avg_performance = np.mean(self.discounted_sum_rewards)
                self.discounted_sum_rewards = []

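                # Read all buffered initial states and contexts and estimate the value of each initial state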
                ins, cons = self.context_buffer.read_buffer()
                initial_states, contexts = np.array(ins), np.array(cons)
                values = self.value_estimator(initial_states)
                if values is None:
                    raise RuntimeError("The value estimator returned None; a valid value estimator is required to update the context distribution.")

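                # Snapshot the pre-update context distribution: the importance weights and the KL constraint
                # are both computed against it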
                old_context_dist = deepcopy(self.context_dist)
                contexts_t = to_float_tensor(contexts, use_cuda=False)
                old_c_log_prob_t = old_context_dist.log_pdf_t(contexts_t).detach()

                # Value estimates of the initial states after the policy update, converted to a tensor
                c_val_t = to_float_tensor(values, use_cuda=False)

                # Compute the current KL to the target distribution and the corresponding penalty coefficient alpha
                cur_kl_t = self.target_context_kl(numpy=False)
                if self.use_avg_performance:
                    alpha_cur_t = self.alpha_function(self.algorithm_iterations, avg_performance, cur_kl_t)
                else:
                    alpha_cur_t = self.alpha_function(self.algorithm_iterations, torch.mean(c_val_t).detach(), cur_kl_t)

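                # KL-constrained natural-gradient step (TRPO-style, via conjugate gradients) on the
                # parameters of the context distribution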
                cg_step(partial(self._compute_context_loss, contexts_t, old_c_log_prob_t, c_val_t, alpha_cur_t),
                        partial(self._compute_context_kl, old_context_dist), self.max_kl,
                        self.context_dist.parameters, self.context_dist.set_weights,
                        self.context_dist.get_weights, **self.cg_parameters, use_cuda=False)

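                # If a standard-deviation lower bound is configured and the distribution is still far from
                # the target (KL above the threshold), clip the standard deviations from below to prevent
                # the sampling distribution from collapsing prematurely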
                cov = self.context_dist._chol_flat.detach().numpy()
                if self.std_lower_bound is not None and self.target_context_kl() > self.kl_threshold:
                    cov[0:self.context_dim] = np.log(np.maximum(np.exp(cov[0:self.context_dim]), self.std_lower_bound))
                    self.context_dist.set_weights(np.concatenate((self.context_dist.mean(), cov)))
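                # Bookkeeping: record the current distribution parameters and progress statistics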
                self.bk["mean"].append(self.context_dist.mean())
                self.bk["covariance"].append(self.context_dist.covariance_matrix())
                self.bk["steps"].append(self.step_counter)
                self.bk["algo_iterations"].append(self.algorithm_iterations)
                self.bk["kl"].append(self.target_context_kl())
            else:
                print("Skipping iteration at step {} because buffers are empty.".format(self.step_counter))
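
    # The helper below presumably belongs to the context distribution class whose log_pdf_t is used
    # above: it converts a NumPy input to a tensor and returns the log-density back as a NumPy array.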
    def log_pdf(self, x):
        x = to_float_tensor(x, self._use_cuda)
        return self.log_pdf_t(x).detach().cpu().numpy()
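
A minimal usage sketch, assuming a teacher instance of the class above, a gym-style env, and a policy callable (all three names are hypothetical and not part of the listing): step_update is called once per environment step.

obs = env.reset()
done = False
while not done:
    action = policy(obs)  # any policy mapping observations to actions
    next_obs, reward, done, info = env.step(action)
    # feed the transition to the teacher so it can track discounted returns and trigger updates
    teacher.step_update(obs, action, reward, next_obs, done)
    obs = next_obs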