Example No. 1
    def __init__(self, x_shape, y_shape, name='supervised_learner',
                 max_n_samples=0,  # number of samples to keep
                 max_n_batches=0,  # number of batches to keep
                 **kwargs):
        super().__init__(x_shape, y_shape, name=name, **kwargs)
        self._dataset = Dataset(max_n_samples=max_n_samples,
                                max_n_batches=max_n_batches)
Example No. 2
    def update(self, ro):
        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            self.policy.update(xs=ro['obs_short'])

        # Mirror descent
        with timed('Update oracle'):

            if self._use_cv:
                # Split ro into two phases
                rollouts = ro.to_list()[:(len(ro) // 2) * 2]  # even length
                ro_mix = rollouts[::2]  # ro with random switch
                assert len(ro_mix) == len(self._t_switch) or len(ro_mix) == len(self._t_switch) - 1
                # if a rollout is too short, it is treated as zero
                ro_exp = []
                for r, t, s in zip(ro_mix, self._t_switch, self._scale):
                    if len(r) >= t:
                        r = r[t-1:]
                        r.scale = s
                        ro_exp.append(r)
                ro_exp = Dataset(ro_exp)
                ro_pol = Dataset(rollouts[1::2])
                _, ev0, ev1 = self.oracle.update(ro_exp=ro_exp,
                                                 ro_pol=ro_pol,
                                                 policy=self.policy)
                # for adaptive sampling
                self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))
            else:
                for r, s in zip(ro, self._scale):
                    r.scale = s
                _, ev0, ev1 = self.oracle.update(ro_exp=ro,
                                                 policy=self.policy)

        with timed('Compute policy gradient'):
            g = self.oracle.grad(self.policy)

        with timed('Policy update'):
            self.learner.update(g)
            self.policy.variable = self.learner.x

        # log
        logz.log_tabular('stepsize', self.learner.stepsize)
        logz.log_tabular('std', np.mean(np.exp(2.*self.policy.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
        logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
        if self._use_cv:
            logz.log_tabular('NumberOfExpertRollouts', len(ro_exp))
            logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))
        else:
            logz.log_tabular('NumberOfExpertRollouts', len(ro))


        # reset
        self._reset_pi_ro()
        self._itr += 1
Example No. 3
    def __init__(self, ref_policy, name='advantage_func_app',
                 max_n_rollouts=float('Inf'),  # number of samples (i.e. rollouts) to keep
                 max_n_batches=0,  # number of batches (i.e. iterations) to keep
                 **kwargs):
        # replay buffer (the user should append ro)
        self.buffer = Dataset(max_n_batches=max_n_batches, max_n_samples=max_n_rollouts)
        assert isinstance(ref_policy, Policy)
        self.ref_policy = ref_policy  # reference policy
        self._ob_shape = ref_policy.x_shape
        self._ac_shape = ref_policy.y_shape
        super().__init__([self._ob_shape, self._ac_shape], (1,), name=name, **kwargs)
Example No. 4
class SupervisedLearner(FunctionApproximator):
    """ FunctionApproximator trained on aggregated data. """

    def __init__(self, x_shape, y_shape, name='supervised_learner',
                 max_n_samples=0,  # number of samples to keep
                 max_n_batches=0,  # number of batches to keep
                 **kwargs):
        super().__init__(x_shape, y_shape, name=name, **kwargs)
        self._dataset = Dataset(max_n_samples=max_n_samples,
                                max_n_batches=max_n_batches)

    def as_funcapp(self):
        """ Return a new copy but without the dataset and update rules. """
        new = copy.copy(self)
        new._dataset = None
        new.update = None
        new.update_funcapp = None
        return new

    def update(self, xs, ys, ws=1.0, **kwargs):
        """ Update the function approximator through supervised learning

            xs, ys, and ws are inputs, outputs, and weights.
        """
        assert len(xs.shape) > 1 and len(ys.shape) > 1
        super().update(xs, ys, ws, **kwargs)
        # update dataset
        ws = ws if isinstance(ws, np.ndarray) else np.ones(xs.shape[0]) * ws
        assert xs.shape[0] == ys.shape[0] == ws.shape[0]
        self._dataset.append(Data(xs=xs, ys=ys, ws=ws))

        # update function approximator
        ev0 = compute_explained_variance(self(xs), ys)
        results = self.update_funcapp(**kwargs)  # return logs, if any
        ev1 = compute_explained_variance(self(xs), ys)

        return results, ev0, ev1

    @abstractmethod
    def update_funcapp(self, **kwargs):
        """ Update the function approximator based on the aggregated dataset. """
Example No. 5
    def update(self, ro_exp=None, ro_pol=None, policy=None, update_nor=True, **kwargs):
        """ Need to provide either `ro_exp` or `ro_pol`, and `policy`.

            `ro_exp` is used to compute an unbiased but noisy estimate of

                E_{pi}[\nabla \pi(s,a) \hat{A}_{\pi^*}(s,a)]

            when \hat{A}_{\pi^*} given by `self._or` is unbiased.

            `ro_pol` provides a biased gradient which can be used as a control
            variate (when `ro_exp` is provided) or just to define a biased
            oracle.
        """
        assert (ro_exp is not None) or (ro_pol is not None)
        assert policy is not None

        # Sync policies' parameters.
        self._policy.assign(policy) # NOTE sync BOTH variables and parameters
        # Update the oracles
        n_rollouts = len(ro_exp) if ro_pol is None else len(ro_pol)
        self._ro_or = None
        if ro_exp is not None:
            # compute adv
            if len(ro_exp)>0:
                advs, _ = self._ae.advs(ro_exp, use_is=self._use_is)
                advs = [a[0:1]*r.scale for a, r in zip(advs, ro_exp)]
                adv = np.concatenate(advs)
                if ro_pol is not None:  # compute the control variate
                    advs_cv, _ = self._ae.advs(ro_exp, use_is=self._use_is, lambd=0.)
                    advs_cv = [a[0:1]*r.scale for a, r in zip(advs_cv, ro_exp)]
                    adv -= np.concatenate(advs_cv)
                logq = np.concatenate([r.lps[0:1] for r in ro_exp])
                # update noisy oracle
                self._scale_or = len(adv)/n_rollouts
                self._or.update(-adv, logq, update_nor=update_nor) # loss is negative reward
                self._ro_or = Dataset([r[0:1] for r in ro_exp])  # for defining logp

        self._ro_cv = None
        if ro_pol is not None:
            # update biased oracle
            advs, _ = self._ae.advs(ro_pol, use_is=self._use_is, lambd=0.)
            adv = np.concatenate(advs)
            self._scale_cv = len(adv)/n_rollouts
            logq = ro_pol['lps']
            self._cv.update(-adv, logq, update_nor=update_nor) # loss is negative reward
            self._ro_cv = ro_pol  # for defining logp

        # Update the value function at the end, so it's unbiased.
        if ro_exp is not None:
            return self._ae.update(ro_exp, **kwargs)
        else:  # when biased gradient is used
            return self._ae.update(ro_pol, **kwargs)
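
The control-variate construction described in the docstring can be illustrated with a toy numpy check that is unrelated to this codebase: subtracting a correlated but biased surrogate and adding back a separate estimate of its mean keeps the estimator unbiased while shrinking the variance. All names below are made up for the illustration.

import numpy as np

rng = np.random.default_rng(0)
noise = rng.standard_normal(100_000)
x = 3.0 + noise            # unbiased but noisy estimate (role of the `ro_exp` term)
y = 1.0 + 0.9 * noise      # correlated, biased surrogate (the control variate)
y_mean = 1.0               # its mean, estimated separately (role of the `ro_pol` term)

cv = x - y + y_mean        # still centered at 3.0, with much lower variance
print(x.mean(), x.var())   # ~3.0, ~1.0
print(cv.mean(), cv.var()) # ~3.0, ~0.01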
Example No. 6
    def split(self, ro, policy_as_expert):
        # Split ro into two phases
        rollouts = ro.to_list()
        ro_mix = [rollouts[i] for i in self._ind_ro_mix]
        ro_pol = [rollouts[i] for i in self._ind_ro_pol]
        assert (len(ro_mix) + len(ro_pol)) == len(rollouts)
        ro_exps = [[] for _ in range(len(self.experts))]
        for r, t, s, k in zipsame(ro_mix, self._t_switch, self._scale,
                                  self._k_star):
            assert len(r) >= t  # because t >= 1
            if not policy_as_expert or k < len(self.experts) - 1:
                # we assume the last expert is the learner
                r = r[t:]
            r.weight = 1.0
            ro_exps[k].append(r)
        if policy_as_expert:
            ro_pol += ro_exps[-1]
            del ro_exps[-1]
        ro_exps = [Dataset(ro_exp) for ro_exp in ro_exps]
        ro_pol = Dataset(ro_pol)

        return ro_exps, ro_pol
Example No. 7
def generate_rollout(pi,
                     logp,
                     env,
                     callback=None,
                     v_end=None,
                     t_state=None,
                     rw_shaping=None,
                     min_n_samples=None,
                     max_n_rollouts=None,
                     min_n_rollouts=0,
                     max_rollout_len=None,
                     with_animation=False):
    """ Collect rollouts until we have enough samples or rollouts.

        Each rollout is generated by repeatedly calling the behavior `pi`. At
        the end of the rollout, the statistics (e.g. observations, actions) are
        packaged as a Rollout object and then `logp` is called **once** to save
        the log probability of the behavior policy `pi`.

        All rollouts are COMPLETE in that they never end prematurely, even when
        `min_n_samples` is reached. They end either when `done` is true, or
        `max_rollout_len` is reached, or `pi` returns None.

        Args:
            `pi`: the behavior policy, which takes (observation, time, done)
                  and returns an action, or None to terminate the rollout.
                  Here `done` is treated as a special symbol of the state.

            `logp`: either None or a function that maps (obs, acs) to log
                    probabilities (called at end of each rollout)

            `env`: a gym-like environment

            `v_end`: the terminal value when the episode ends (a callable
                     function of observation and done)

            `t_state`: a function that maps time to desired features

            `rw_shaping`: a function that maps a reward to the new reward

            `max_rollout_len`: the maximal length of a rollout (i.e. the problem's horizon)

            `min_n_samples`: the minimal number of samples to collect

            `max_n_rollouts`: the maximal number of rollouts

            `min_n_rollouts`: the minimal number of rollouts

            `with_animation`: display an animation of the first rollout

    """
    # Configs
    assert (min_n_samples is not None) or (max_n_rollouts is not None)  # so we can stop
    min_n_samples = min_n_samples or float('Inf')
    max_n_rollouts = max_n_rollouts or float('Inf')
    min_n_rollouts = min(min_n_rollouts, max_n_rollouts)
    max_rollout_len = max_rollout_len or float('Inf')
    max_episode_steps = getattr(env, '_max_episode_steps', float('Inf'))
    max_rollout_len = min(max_episode_steps, max_rollout_len)

    if v_end is None:

        def v_end(ob, dn):
            return 0.

    if rw_shaping is None:

        def rw_shaping(rw, ob, ac):
            return rw

    def post_process(x, t):
        # Augment observation with time information, if needed.
        if t_state is None:
            return x
        return np.concatenate([x.flatten(), (t_state(t),)])

    def step(ac, tm):
        ob, rw, dn, info = env.step(ac)  # current reward, next ob and dn
        return post_process(ob, tm), rw, dn, info

    def reset(tm):
        ob = env.reset()
        return post_process(ob, tm)

    # Start trajectory-wise rollouts.
    n_samples = 0
    rollouts = []
    while True:
        animate_this_rollout = len(rollouts) == 0 and with_animation
        obs, acs, rws = [], [], []
        tm = 0  # time step
        dn = False
        ob = reset(tm)
        # each trajectory
        while True:
            if animate_this_rollout:
                env.render()
                time.sleep(0.05)
            ac = pi(ob, tm, dn)  # query the behavior policy at time tm
            if ac is None:
                dn = False  # the learner decides to stop collecting data
                break
            # ob, ac, rw are at tm
            obs.append(ob)
            acs.append(ac)
            ob, rw, dn, _ = step(ac, tm)
            rw = rw_shaping(rw, ob, ac)
            rws.append(rw)
            tm += 1
            if dn or tm >= max_rollout_len:
                break  # due to steps limit or entering an absorbing state
        # save the terminal observation/reward
        obs.append(ob)
        rws.append(v_end(ob, dn))  # terminal reward
        # end of one rollout (`logp` is called once)
        rollout = Rollout(obs=obs, acs=acs, rws=rws, done=dn, logp=logp)
        if callback is not None:
            callback(rollout)
        rollouts.append(rollout)
        n_samples += len(rollout)
        if (n_samples >= min_n_samples) or (len(rollouts) >= max_n_rollouts):
            if len(rollouts) >= min_n_rollouts:
                break
    ro = Dataset(rollouts)
    return ro
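
A hedged usage sketch follows. The gym environment, the uniform random behavior policy, and the classic gym reset/step API are assumptions made for illustration; only `generate_rollout` itself comes from the code above.

import gym
import numpy as np

env = gym.make('CartPole-v1')

def pi(ob, t, done):
    # random behavior policy; returning None would end the rollout early
    return env.action_space.sample()

def logp(obs, acs):
    # log probability of the uniform policy, one value per recorded step
    return np.full(len(acs), -np.log(env.action_space.n))

ro = generate_rollout(pi, logp, env, min_n_samples=2000, max_rollout_len=500)
print('collected', len(ro), 'rollouts')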