Example #1
    def test_pad_tensor(self):
        results = pad_tensor(self.tensor, self.max_len)
        assert len(self.tensor) == 3
        assert np.array_equal(results, [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])

        results = pad_tensor(self.tensor, self.max_len, mode='last')
        assert np.array_equal(results, [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
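For reference, a minimal sketch of a pad_tensor-style helper that reproduces the behaviour exercised above (zero padding by default, repeat-last padding with mode='last'); the name pad_tensor_sketch is hypothetical and the real tensor_utils.pad_tensor may differ in detail:

import numpy as np

def pad_tensor_sketch(x, max_len, mode='zero'):
    """Pad a 1-D array to max_len with zeros, or by repeating the last value."""
    x = np.asarray(x)
    pad_len = max_len - len(x)
    if pad_len <= 0:
        return x[:max_len]
    if mode == 'last':
        fill = np.full(pad_len, x[-1])
    else:
        fill = np.zeros(pad_len, dtype=x.dtype)
    return np.concatenate([x, fill])

# pad_tensor_sketch([1, 1, 1], 10)               -> [1 1 1 0 0 0 0 0 0 0]
# pad_tensor_sketch([1, 1, 1], 10, mode='last')  -> [1 1 1 1 1 1 1 1 1 1]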
Example #2
def sliding_window(t, window, step_size, smear=False):
    if window > t.shape[0]:
        raise ValueError("`window` must be <= `t.shape[0]`")
    elif window == t.shape[0]:
        return np.stack([t] * window)

    # TODO(gh/19): this is broken for other step sizes. The problem may be with
    # the transpose trick
    if step_size != 1:
        raise NotImplementedError

    # The stride trick works only on the last dimension of an ndarray, so we
    # operate on the transpose, which reverses the dimensions of t.
    t_T = t.T

    shape = t_T.shape[:-1] + (t_T.shape[-1] - window + 1 - step_size, window)
    strides = t_T.strides + (t_T.strides[-1] * step_size, )
    t_T_win = np.lib.stride_tricks.as_strided(t_T,
                                              shape=shape,
                                              strides=strides)

    # t_T_win has shape (d_k, d_k-1, ..., (n - window_size), window_size).
    # To arrive at the final shape, we first transpose the result to get
    # (window_size, (n - window_size), d_1, ..., d_k), then swap the first
    # two axes.
    t_win = np.swapaxes(t_T_win.T, 0, 1)

    # Optionally smear the last element to preserve the first dimension
    if smear:
        t_win = tensor_utils.pad_tensor(t_win, t.shape[0], mode='last')

    return t_win
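A quick usage sketch for the supported step_size=1 case, just to illustrate the output layout (the input array is made up):

import numpy as np

# 6 timesteps, each a 2-dimensional observation
t = np.arange(12).reshape(6, 2)
windows = sliding_window(t, window=3, step_size=1)
# windows.shape == (3, 3, 2): three windows of length 3, each keeping the
# trailing observation dimension; windows[0] equals t[:3].
# With smear=True, the first dimension would be padded back to t.shape[0]
# by repeating the last window (via tensor_utils.pad_tensor, mode='last').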
Example #3
    def optimize_policy(self, itr, samples_data):
        # Init vars
        rewards = samples_data['rewards']
        actions = samples_data['actions']
        observations = samples_data['observations']

        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        if self.policy.recurrent:
            recurrent_vals = [samples_data["valids"]]
        else:
            recurrent_vals = []
        # Compute sample Bellman error.
        feat_diff = []
        for path in samples_data['paths']:
            feats = self._features(path)
            feats = np.vstack([feats, np.zeros(feats.shape[1])])
            feat_diff.append(feats[1:] - feats[:-1])
        if self.policy.recurrent:
            max_path_length = max(
                [len(path["advantages"]) for path in samples_data["paths"]])
            # pad feature diffs
            feat_diff = np.array([
                tensor_utils.pad_tensor(fd, max_path_length)
                for fd in feat_diff
            ])
        else:
            feat_diff = np.vstack(feat_diff)

        #################
        # Optimize dual #
        #################

        # Here we need to optimize the dual through BFGS in order to obtain
        # the \eta value. Initialize the dual function g(\theta, v), with
        # \eta > 0. First evaluate delta_v.
        f_dual = self.opt_info['f_dual']
        f_dual_grad = self.opt_info['f_dual_grad']

        # Set BFGS eval function
        def eval_dual(input):
            param_eta = input[0]
            param_v = input[1:]
            val = f_dual(*([rewards, feat_diff] + state_info_list +
                           recurrent_vals + [param_eta, param_v]))
            return val.astype(np.float64)

        # Set BFGS gradient eval function
        def eval_dual_grad(input):
            param_eta = input[0]
            param_v = input[1:]
            grad = f_dual_grad(*([rewards, feat_diff] + state_info_list +
                                 recurrent_vals + [param_eta, param_v]))
            eta_grad = float(grad[0])
            v_grad = grad[1]
            return np.hstack([eta_grad, v_grad])

        # Initial BFGS parameter values.
        x0 = np.hstack([self.param_eta, self.param_v])

        # Set parameter boundaries: \eta>0, v unrestricted.
        bounds = [(-np.inf, np.inf) for _ in x0]
        bounds[0] = (0., np.inf)

        # Optimize through BFGS
        logger.log('optimizing dual')
        eta_before = x0[0]
        dual_before = eval_dual(x0)
        params_ast, _, _ = self.optimizer(func=eval_dual,
                                          x0=x0,
                                          fprime=eval_dual_grad,
                                          bounds=bounds,
                                          maxiter=self.max_opt_itr,
                                          disp=0)
        dual_after = eval_dual(params_ast)

        # Optimal values have been obtained
        self.param_eta = params_ast[0]
        self.param_v = params_ast[1:]

        ###################
        # Optimize policy #
        ###################
        cur_params = self.policy.get_param_values(trainable=True)
        f_loss = self.opt_info["f_loss"]
        f_loss_grad = self.opt_info['f_loss_grad']
        input = [
            rewards, observations, feat_diff, actions
        ] + state_info_list + recurrent_vals + [self.param_eta, self.param_v]

        # Set loss eval function
        def eval_loss(params):
            self.policy.set_param_values(params, trainable=True)
            val = f_loss(*input)
            return val.astype(np.float64)

        # Set loss gradient eval function
        def eval_loss_grad(params):
            self.policy.set_param_values(params, trainable=True)
            grad = f_loss_grad(*input)
            flattened_grad = tensor_utils.flatten_tensors(
                list(map(np.asarray, grad)))
            return flattened_grad.astype(np.float64)

        loss_before = eval_loss(cur_params)
        logger.log('optimizing policy')
        params_ast, _, _ = self.optimizer(func=eval_loss,
                                          x0=cur_params,
                                          fprime=eval_loss_grad,
                                          disp=0,
                                          maxiter=self.max_opt_itr)
        loss_after = eval_loss(params_ast)

        f_kl = self.opt_info['f_kl']

        mean_kl = f_kl(*([observations, actions] + state_info_list +
                         dist_info_list + recurrent_vals)).astype(np.float64)

        logger.log('eta %f -> %f' % (eta_before, self.param_eta))

        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)
        logger.record_tabular('DualBefore', dual_before)
        logger.record_tabular('DualAfter', dual_after)
        logger.record_tabular('MeanKL', mean_kl)
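Both the dual and the policy optimization above call self.optimizer with the (func, x0, fprime, bounds, maxiter, disp) signature and unpack a 3-tuple, which is consistent with scipy.optimize.fmin_l_bfgs_b; that this is what self.optimizer actually holds is an assumption here. A standalone sketch of the same calling convention on a toy quadratic objective (all values made up):

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

def eval_obj(x):
    # Toy dual-style objective: minimized at x == 1.
    return float(np.sum((x - 1.0) ** 2))

def eval_obj_grad(x):
    return (2.0 * (x - 1.0)).astype(np.float64)

x0 = np.array([5.0, -3.0, 2.0])
# First parameter constrained positive (like \eta), the rest unrestricted.
bounds = [(0., np.inf)] + [(-np.inf, np.inf)] * (len(x0) - 1)

x_opt, f_opt, info = fmin_l_bfgs_b(func=eval_obj,
                                   x0=x0,
                                   fprime=eval_obj_grad,
                                   bounds=bounds,
                                   maxiter=10,
                                   disp=0)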
Example #4
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, 'predict_n'):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path['rewards'] + \
                self.algo.discount * path_baselines[1:] - path_baselines[:-1]
            path['advantages'] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path['returns'] = special.discount_cumsum(path['rewards'],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path['returns'])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path['observations'] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path['actions'] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path['rewards'] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path['returns'] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path['advantages'] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path['env_infos'] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path['agent_infos'] for path in paths])

            if self.algo.center_adv:
                advantages = utils.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = utils.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path['returns'][0] for path in paths])

            undiscounted_returns = [sum(path['rewards']) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path['advantages']) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path['observations'] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path['advantages'] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path['advantages'] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path['advantages'] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path['actions'] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path['rewards'] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path['returns'] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path['agent_infos'] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path['env_infos'] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path['returns']) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path['returns'][0] for path in paths])

            undiscounted_returns = [sum(path['rewards']) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log('fitting baseline...')
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log('fitted')

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('ExplainedVariance', ev)
        tabular.record('NumTrajs', len(paths))
        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data
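The advantage computation in process_samples hinges on special.discount_cumsum, the reverse discounted cumulative sum used both for GAE advantages (over the TD deltas) and for returns (over the rewards). A minimal sketch under the assumption that it implements the standard recurrence y[t] = x[t] + discount * y[t+1]:

import numpy as np

def discount_cumsum_sketch(x, discount):
    """y[t] = sum over k >= t of discount**(k - t) * x[k]."""
    y = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

# Mirrors the per-path loop above (numbers are made up):
rewards = np.array([1.0, 0.0, 2.0])
path_baselines = np.array([0.5, 0.4, 0.3, 0.0])   # baseline with appended terminal 0
discount, gae_lambda = 0.99, 0.95
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum_sketch(deltas, discount * gae_lambda)
returns = discount_cumsum_sketch(rewards, discount)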