Example #1
    def optimize_policy(self, itr, samples_data):
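        """Optimize the policy and log loss, KL, entropy, and action statistics."""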
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Train policy network
        logger.log('Computing loss before')
        loss_before = self.optimizer.loss(policy_opt_input_values)
        logger.log('Computing KL before')
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        logger.log('Optimizing')
        self.optimizer.optimize(policy_opt_input_values)
        logger.log('Computing KL after')
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        logger.log('Computing loss after')
        loss_after = self.optimizer.loss(policy_opt_input_values)
        tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
        tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
        tabular.record('{}/dLoss'.format(self.policy.name),
                       loss_before - loss_after)
        tabular.record('{}/KLBefore'.format(self.policy.name),
                       policy_kl_before)
        tabular.record('{}/KL'.format(self.policy.name), policy_kl)

        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

        # Record a histogram of the actions from this batch of trajectories.
        num_traj = self.batch_size // self.max_path_length
        actions = samples_data['actions'][:num_traj, ...]

        histogram = Histogram(actions)
        tabular.record('{}/Actions'.format(self.policy.name), histogram)

        self._fit_baseline(samples_data)

        return self.get_itr_snapshot(itr, samples_data)
Example #2
    def test_record_histogram(self):
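        """Recording a Histogram should call SummaryWriter.add_histogram once."""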
        with mock.patch('tensorboardX.SummaryWriter'):
            norm = scipy.stats.norm(loc=[1., 0.], scale=[0.5, 1.5])
            samples = norm.rvs((10000, 2))
            hist = Histogram(samples)
            self.tensor_board_output.record('Samples', hist)
            self.tensor_board_output.dump()

            assert self.mock_writer.add_histogram.call_count == 1
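
The test above references self.tensor_board_output and self.mock_writer, which the snippet does not define. A minimal sketch of a fixture that could supply them, assuming dowel's TensorBoardOutput and that its internal writer can be swapped for a mock; the class name, log directory, and the _writer attribute are illustrative assumptions, not code from the original test suite:

import unittest
from unittest import mock

import scipy.stats  # used by the test body above
from dowel import Histogram, TensorBoardOutput  # used by the test body above


class TestTensorBoardOutput(unittest.TestCase):

    def setUp(self):
        # Patch tensorboardX so no event files are written while constructing the output.
        with mock.patch('tensorboardX.SummaryWriter'):
            self.tensor_board_output = TensorBoardOutput('test_log_dir')
        # Swap the underlying writer for a mock so calls can be asserted on.
        self.mock_writer = mock.Mock()
        self.tensor_board_output._writer = self.mock_writer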
Example #3
    def _visualize_distribution(self):
        """Visualize encoder distribution."""
        num_tasks = self.policy.task_space.flat_dim
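        # One-hot vector for each task, used to query the encoder.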
        all_tasks = np.eye(num_tasks, num_tasks)
        _, latent_infos = self.policy.encoder.get_latents(all_tasks)

        # Sample each latent dimension's Gaussian and record it as a histogram.
        for task in range(num_tasks):
            for i in range(self.policy.latent_space.flat_dim):
                stds = np.exp(latent_infos['log_std'][task, i])

                norm = scipy.stats.norm(loc=latent_infos['mean'][task, i],
                                        scale=stds)
                samples = norm.rvs(100)
                hist = Histogram(samples)
                tabular.record('Encoder/task={},i={}'.format(task, i), hist)
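
All three examples follow the same pattern from the dowel logging library (used by the garage framework): wrap an array of raw samples in Histogram, record it under a tag with tabular, and dump the result to a TensorBoard-backed output. A minimal, self-contained sketch of that pattern, assuming dowel and tensorboardX are installed; the log directory and tag names here are arbitrary choices for illustration:

import numpy as np
from dowel import Histogram, TensorBoardOutput, logger, tabular

# Write logged tabular data as TensorBoard event files.
logger.add_output(TensorBoardOutput('runs/histogram_demo'))

# Wrapping raw samples in Histogram tells the TensorBoard output to record
# them as a histogram rather than a scalar (as Example #2 tests).
samples = np.random.normal(loc=0.0, scale=1.0, size=1000)
tabular.record('Demo/Samples', Histogram(samples))

# Log the tabular data and flush all outputs.
logger.log(tabular)
logger.dump_all()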