Example #1
    def _update_func_approx(self, x, y, w, to_log=False, log_prefix=''):
        """ Update the function approximator based on the current data (x, y,
        w) or through self._agg_data which is up-to-date with (x, y, w). """
        # initial loss
        loss_before = self._compute_loss(x, y, w)  # just on the current sample?
        explained_variance_before = math_utils.compute_explained_variance(
            self.predict(x), y)
        # optimization
        self.prepare_for_update(x)
        x_agg, y_agg, w_agg = (self._agg_data['x'], self._agg_data['y'],
                               self._agg_data['w'])
        # use the aggregated data to update
        lr = self._update_with_lr_search(x_agg, y_agg, w_agg)
        # new loss
        loss_after = self._compute_loss(x, y, w)
        explained_variance_after = math_utils.compute_explained_variance(
            self.predict(x), y)
        if to_log:
            logz.log_tabular('LossBefore({}){}'.format(self.name, log_prefix),
                             loss_before)
            logz.log_tabular('LossAfter({}){}'.format(self.name, log_prefix),
                             loss_after)
            logz.log_tabular(
                'ExplainedVarianceBefore({}){}'.format(self.name, log_prefix),
                explained_variance_before)
            logz.log_tabular(
                'ExplainedVarianceAfter({}){}'.format(self.name, log_prefix),
                explained_variance_after)
            logz.log_tabular(
                'UsedLearningRate({}){}'.format(self.name, log_prefix), lr)
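
The explained-variance values logged above come from math_utils.compute_explained_variance, whose implementation is not shown in this snippet. The sketch below uses the conventional definition, 1 - Var(y - y_pred) / Var(y), as an assumption of what the helper computes; the repository's version may handle weighting or degenerate cases differently.

import numpy as np

def compute_explained_variance(y_pred, y):
    """Conventional explained variance: 1 - Var(y - y_pred) / Var(y)."""
    var_y = np.var(y)
    if var_y == 0:
        return np.nan  # undefined when the targets are constant
    return 1.0 - np.var(y - y_pred) / var_y

# A predictor that tracks the targets closely explains most of the variance.
y = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.1, 1.9, 3.2, 3.8])
print(compute_explained_variance(y_pred, y))  # ~0.98
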
Example #2
    def update(self, ros, agents):
        # Aggregate data
        ro = self.merge(ros)

        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            self.policy.update(xs=ro['obs_short'])

        with timed('Update oracle'):
            _, ev0, ev1 = self.oracle.update(ro, self.policy)

        with timed('Compute policy gradient'):
            g = self.oracle.grad(self.policy.variable)

        with timed('Policy update'):
            if isinstance(self.learner, ol.FisherOnlineOptimizer):
                if self._optimizer == 'trpo_wl':  # also use the loss function
                    self.learner.update(g, ro=ro, policy=self.policy,
                                        loss_fun=self.oracle.fun)
                else:
                    self.learner.update(g, ro=ro, policy=self.policy)
            else:
                self.learner.update(g)
            self.policy.variable = self.learner.x

        # log
        logz.log_tabular('stepsize', self.learner.stepsize)
        if hasattr(self.policy, 'lstd'):
            logz.log_tabular('std', np.mean(np.exp(self.policy.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
        logz.log_tabular('ExplainVarianceAfter(AE)', ev1)

        self._itr += 1
Example #3
    def update(self,
               ro,
               update_nor=False,
               shift_adv=False,
               to_log=False,
               log_prefix=''):
        """
            Args:
                ro: RO object representing the new information
                update_nor: whether to update the control variate of
                    tfLikelihoodRatioOracle
                shift_adv: whether to force the adv values to be positive. If a
                    float, it specifies the amount to shift.
        """

        self._ro = ro  # save the ref to rollouts

        # Compute adv.
        advs, vfns = self._ae.advs(ro)  # adv has its own ref_policy
        adv = np.concatenate(advs)
        if shift_adv:  # make adv non-negative
            assert self._use_log_loss
            if shift_adv is True:
                adv = adv - np.min(adv)
            else:
                adv = adv - np.mean(adv) + shift_adv
            self._nor.reset()  # defined in tfLikelihoodRatioOracle
            update_nor = False

        if not self._normalize_weighting:
            if self._avg_type == 'sum':  # rescale the problem if needed
                adv *= len(adv) / len(ro)

        # Update the loss function.
        if self._use_log_loss is True:
            #  - E_{ob} E_{ac ~ q | ob} [ w * log p(ac|ob) * adv(ob, ac) ]
            if self._onestep_weighting:  # consider importance weight
                w_or_logq = np.concatenate(
                    self._ae.weights(ro,
                                     policy=self.policy))  # helper function
            else:
                w_or_logq = np.ones_like(adv)
        else:  # False or None
            #  - E_{ob} E_{ac ~ q | ob} [ p(ac|ob)/q(ac|ob) * adv(ob, ac) ]
            assert self._onestep_weighting
            w_or_logq = ro.lps

        if to_log:
            vfn = np.concatenate(vfns)
            logz.log_tabular('max_adv', np.amax(np.abs(adv)))
            logz.log_tabular('max_vfn', np.amax(np.abs(vfn)))

        # Update the tfLikelihoodRatioOracle.
        super().update(-adv, w_or_logq, [ro.obs, ro.acs],
                       update_nor)  # loss is negative reward
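
The shift_adv branch above either makes the advantages non-negative by subtracting their minimum (shift_adv is True) or recenters their mean at a given offset (shift_adv is a float). A small stand-alone numpy sketch of the two modes, using made-up advantage values:

import numpy as np

adv = np.array([-2.0, 0.5, 1.5, -1.0])  # hypothetical advantages

# shift_adv is True: subtract the minimum so all values are non-negative.
shifted_min = adv - np.min(adv)            # [0.0, 2.5, 3.5, 1.0]

# shift_adv is a float (e.g. 0.5): recenter the mean at that value.
shift = 0.5
shifted_mean = adv - np.mean(adv) + shift  # mean becomes 0.5

print(shifted_min, shifted_mean)
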
Example #4
    def run(self,
            n_itrs,
            pretrain=True,
            seed=None,
            save_freq=None,
            eval_freq=None,
            final_eval=False,
            final_save=True):

        eval_policy = eval_freq is not None
        save_policy = save_freq is not None

        if seed is not None:
            set_randomseed(seed)
            self.mdp.env.seed(seed)

        start_time = time.time()
        if pretrain:
            self.alg.pretrain(functools.partial(self.gen_ro, to_log=False))

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)

            if eval_policy:
                if itr % eval_freq == 0:
                    self._eval_policy()

            with timed('Generate env rollouts'):
                ros, agents = self.gen_ro(self.alg.agent('behavior'),
                                          to_log=not eval_policy)
            self.alg.update(ros, agents)

            if save_policy:
                if itr % save_freq == 0:
                    self._save_policy(self.alg.policy, itr)
            # dump log
            logz.dump_tabular()

        # Save the final policy.
        if final_eval:
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr + 1)
            self._eval_policy()
            logz.dump_tabular()

        if final_save:
            self._save_policy(self.alg.policy, n_itrs)
            self._save_policy(self.best_policy, 'best')
Example #5
    def update(self, ros, agents):
        # Aggregate data
        ro = self.merge(ros)

        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            self.policy.update(xs=ro['obs_short'])

        # Below we update `distribution` where the variables are hosted.
        with timed('Update oracle'):
            _, err0, err1 = self.oracle.update(ro, self.distribution)  # dist.

        with timed('Compute policy gradient'):
            g = self.oracle.grad(self.distribution.variable)  # dist.

        with timed('Policy update'):
            if isinstance(self.learner, ol.FisherOnlineOptimizer):
                if self._optimizer == 'trpo_wl':  # also use the loss function
                    self.learner.update(g,
                                        ro=ro,
                                        policy=self.distribution,
                                        loss_fun=self.oracle.fun)  # dist.
                else:
                    self.learner.update(g, ro=ro,
                                        policy=self.distribution)  # dist.
            else:
                self.learner.update(g)
            self.distribution.variable = self.learner.x  # dist.

        # log
        logz.log_tabular('stepsize', self.learner.stepsize)
        if hasattr(self.distribution, 'lstd'):
            logz.log_tabular('std', np.mean(np.exp(self.distribution.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        logz.log_tabular('NrmseBefore(AE)', err0)
        logz.log_tabular('NrmseAfter(AE)', err1)

        self._itr += 1
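
Examples 2 and 5 differ mainly in the error metric returned by the oracle update: explained variance versus NRMSE. The oracle's exact NRMSE definition is not visible in this snippet; the sketch below assumes the common convention of root-mean-squared error normalized by the standard deviation of the targets, and should be read as an illustration rather than the codebase's definition.

import numpy as np

def nrmse(y_pred, y):
    """RMSE normalized by the standard deviation of the targets (one common
    convention; the oracle here may use a different normalization)."""
    rmse = np.sqrt(np.mean((y - y_pred) ** 2))
    std = np.std(y)
    return rmse / std if std > 0 else np.inf

y = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.2, 1.8, 3.1, 4.3])
print(nrmse(y_pred, y))  # smaller is better; ~0.19 here
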
Example #6
    def update(self, ros, agents):  # agents are behavior policies
        # Aggregate data
        data = [
            a.split(ro, self.policy_as_expert) for ro, a in zip(ros, agents)
        ]
        ro_exps = [d[0] for d in data]
        # Transpose, s.t. len(ro_exps) == len(self.experts).
        ro_exps = list(map(list, zip(*ro_exps)))
        ro_exps = [self.merge(ros) for ros in ro_exps]
        ro_pol = [d[1] for d in data]
        ro_pol = self.merge(ro_pol)

        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            ro = self.merge(ros)
            self.policy.update(xs=ro['obs_short'])

        with timed('Update oracle'):
            # Update the value function of the experts
            EV0, EV1 = [], []
            for k, ro_exp in enumerate(ro_exps):
                if len(ro_exp) > 0:
                    _, ev0, ev1 = self.aes[k].update(ro_exp)
                    EV0.append(ev0)
                    EV1.append(ev1)
            # Update oracle
            self.oracle.update(ro_pol, update_vfn=False, policy=self.policy)

            # Update the value function of the learner (after the oracle
            # update, so it is unbiased)
            if self.policy_as_expert:
                _, ev0, ev1 = self.aes[-1].update(ro_pol)

            # For adaptive sampling
            self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))

        with timed('Compute gradient'):
            g = self.oracle.grad(self.policy.variable)

        with timed('Policy update'):
            if isinstance(self.learner, ol.FisherOnlineOptimizer):
                if self._optimizer == 'trpo_wl':  # also use the loss function
                    self.learner.update(g,
                                        ro=ro,
                                        policy=self.policy,
                                        loss_fun=self.oracle.fun)
                else:
                    self.learner.update(g, ro=ro, policy=self.policy)
            else:
                self.learner.update(g)
            self.policy.variable = self.learner.x

        # Log
        logz.log_tabular('stepsize', self.learner.stepsize)
        logz.log_tabular('std', np.mean(np.exp(2. * self.policy.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        if self.policy_as_expert:
            logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
            logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
        logz.log_tabular('MeanExplainVarianceBefore(AE)', np.mean(EV0))
        logz.log_tabular('MeanExplainVarianceAfter(AE)', np.mean(EV1))
        logz.log_tabular('NumberOfExpertRollouts',
                         np.sum([len(ro) for ro in ro_exps]))
        logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))

        # Reset
        self._itr += 1
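
The line ro_exps = list(map(list, zip(*ro_exps))) above is a plain list-of-lists transpose: it regroups the per-agent expert rollouts so the outer index runs over experts instead of agents. A minimal sketch with hypothetical placeholder entries:

# Rows: one entry per agent; columns: one entry per expert.
per_agent = [['a0_e0', 'a0_e1'],
             ['a1_e0', 'a1_e1'],
             ['a2_e0', 'a2_e1']]

# Transpose so the outer list runs over experts instead of agents.
per_expert = list(map(list, zip(*per_agent)))
print(per_expert)
# [['a0_e0', 'a1_e0', 'a2_e0'], ['a0_e1', 'a1_e1', 'a2_e1']]
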
Example #7
    def gen_ro(self,
               agent,
               mdp=None,
               ro_kwargs=None,
               initialize=False,
               prefix='',
               to_log=False,
               eval_mode=False):
        """ Run the agent in the mdp and return rollout statistics as a Dataset
            and the agent that collects it.

            mpds, ro_kwargs can be either a single instance or a list.
        """
        ro_kwargs = ro_kwargs or self.ro_kwargs
        mdp = mdp or self.mdp

        # Make mdp and ro_kwargs into lists
        if not isinstance(mdp, list):
            mdp = [mdp]
        if not isinstance(ro_kwargs, list):
            ro_kwargs = [ro_kwargs]
        if len(mdp) > 1 and len(ro_kwargs) == 1:
            ro_kwargs *= len(mdp)
        assert len(mdp) == len(ro_kwargs)

        # Run the agent and log statistics
        ros_all, agents_all = [], []
        avg_performance = 0.
        for i, (m, kw) in enumerate(zip(mdp, ro_kwargs)):
            if initialize:  # so deterministic behaviors can be realized.
                m.initialize()
            ros, agents = m.run(agent, **kw)
            ros_all.extend(ros)
            agents_all.extend(agents)

            # Log
            ro = functools.reduce(lambda x, y: x + y, ros)
            if not eval_mode:
                self._n_rollouts += len(ro)
                self._n_samples += ro.n_samples
            if to_log:
                if len(mdp) > 1:
                    prefix = 'MDP' + str(i) + '_'
                # current ro
                gamma = m.gamma
                sum_of_rewards = [
                    ((gamma**np.arange(len(r.rws))) * r.rws).sum() for r in ro
                ]
                performance = np.mean(sum_of_rewards)
                if gamma < 1.:
                    avg_of_rewards = [(1 - gamma) * sr
                                      for sr, r in zip(sum_of_rewards, ro)]
                else:
                    avg_of_rewards = [
                        sr / len(r) for sr, r in zip(sum_of_rewards, ro)
                    ]

                performance_avg = np.mean(avg_of_rewards)
                rollout_lens = [len(rollout) for rollout in ro]
                n_samples = sum(rollout_lens)
                logz.log_tabular(prefix + "NumSamples", n_samples)
                logz.log_tabular(prefix + "NumberOfRollouts", len(ro))
                logz.log_tabular(prefix + "MeanAvgOfRewards", performance_avg)
                logz.log_tabular(prefix + "MeanSumOfRewards", performance)
                logz.log_tabular(prefix + "StdSumOfRewards",
                                 np.std(sum_of_rewards))
                logz.log_tabular(prefix + "MaxSumOfRewards",
                                 np.max(sum_of_rewards))
                logz.log_tabular(prefix + "MinSumOfRewards",
                                 np.min(sum_of_rewards))
                logz.log_tabular(prefix + "MeanRolloutLens",
                                 np.mean(rollout_lens))
                logz.log_tabular(prefix + "StdRolloutLens",
                                 np.std(rollout_lens))

                avg_performance += performance / len(mdp)

        if to_log:  # total
            if avg_performance >= self.best_performance:
                self.best_policy = copy.deepcopy(self.alg.policy)
                self.best_performance = avg_performance
            logz.log_tabular(prefix + 'TotalNumberOfSamples', self._n_samples)
            logz.log_tabular(prefix + 'TotalNumberOfRollouts',
                             self._n_rollouts)
            logz.log_tabular(prefix + 'BestSumOfRewards',
                             self.best_performance)

        return ros_all, agents_all
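
The per-rollout statistic ((gamma**np.arange(len(r.rws))) * r.rws).sum() computed above is the discounted sum of rewards, sum_t gamma^t * r_t. A short stand-alone version with made-up rewards:

import numpy as np

def discounted_return(rws, gamma):
    """Sum of gamma**t * r_t over one rollout's reward sequence."""
    return float((gamma ** np.arange(len(rws)) * rws).sum())

rws = np.array([1.0, 1.0, 1.0])
print(discounted_return(rws, 0.9))  # 1 + 0.9 + 0.81 = 2.71
print(discounted_return(rws, 1.0))  # undiscounted sum = 3.0
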
Example #8
    def update(self, ro):

        self._ro = ro
        if not self._ignore_samples:
            # update input normalizer for whitening
            self._policy.prepare_for_update(self._ro.obs)

            # Correction Step (Model-free)
            self._correction()

        # end of round
        self._itr += 1

        # log
        logz.log_tabular('pcl_stepsize', self._pcl.stepsize)
        logz.log_tabular('std', np.mean(self._policy.std))
        if not self._ignore_samples:
            logz.log_tabular('true_grads_size', np.linalg.norm(self._g))
            logz.log_tabular('pred_grads_size',
                             np.linalg.norm(self._pcl.g_hat))
            pred_error_size = np.linalg.norm(self._g - self._pcl.g_hat)
            ratio = pred_error_size / np.linalg.norm(self._g)
            logz.log_tabular('pred_error_size', pred_error_size)
            logz.log_tabular('pred_error_true_ratio', ratio)

        # Prediction Step (Model-based)
        if self._w_pred:
            self._prediction()

        # log
        logz.log_tabular('std_after', np.mean(self._policy.std))
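
The pred_error_true_ratio logged above is the norm of the gradient-prediction error divided by the norm of the true gradient, i.e. how large the predicted gradient's error is relative to the model-free gradient. A minimal numpy sketch with hypothetical vectors standing in for self._g and self._pcl.g_hat:

import numpy as np

g = np.array([1.0, -2.0, 0.5])      # stand-in for the true gradient self._g
g_hat = np.array([0.8, -1.7, 0.6])  # stand-in for the predicted gradient

pred_error_size = np.linalg.norm(g - g_hat)
ratio = pred_error_size / np.linalg.norm(g)
print(pred_error_size, ratio)  # ~0.37 and ~0.16
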
Example #9
    def update(self, ro):
        # Update input normalizer for whitening
        if self._itr < self._n_warm_up_itrs:
            self.policy.update(xs=ro['obs_short'])

        # Mirror descent
        with timed('Update oracle'):

            if self._use_cv:
                # Split ro into two phases
                rollouts = ro.to_list()[:(len(ro) // 2) * 2]  # even length
                ro_mix = rollouts[0::2]  # rollouts with a random switch
                assert (len(ro_mix) == len(self._t_switch)
                        or len(ro_mix) == len(self._t_switch) - 1)
                # If a rollout is too short, it is treated as zero.
                ro_exp = []
                for r, t, s in zip(ro_mix, self._t_switch, self._scale):
                    if len(r) >= t:
                        r = r[t - 1:]
                        r.scale = s
                        ro_exp.append(r)
                ro_exp = Dataset(ro_exp)
                ro_pol = Dataset(rollouts[1::2])
                _, ev0, ev1 = self.oracle.update(ro_exp=ro_exp,
                                                 ro_pol=ro_pol,
                                                 policy=self.policy)
                # for adaptive sampling
                self._avg_n_steps.update(np.mean([len(r) for r in ro_pol]))
            else:
                for r, s in zip(ro, self._scale):
                    r.scale = s
                _, ev0, ev1 = self.oracle.update(ro_exp=ro,
                                                 policy=self.policy)

        with timed('Compute policy gradient'):
            g = self.oracle.grad(self.policy)

        with timed('Policy update'):
            self.learner.update(g)
            self.policy.variable = self.learner.x

        # log
        logz.log_tabular('stepsize', self.learner.stepsize)
        logz.log_tabular('std', np.mean(np.exp(2. * self.policy.lstd)))
        logz.log_tabular('g_norm', np.linalg.norm(g))
        logz.log_tabular('ExplainVarianceBefore(AE)', ev0)
        logz.log_tabular('ExplainVarianceAfter(AE)', ev1)
        if self._use_cv:
            logz.log_tabular('NumberOfExpertRollouts', len(ro_exp))
            logz.log_tabular('NumberOfLearnerRollouts', len(ro_pol))
        else:
            logz.log_tabular('NumberOfExpertRollouts', len(ro))


        # reset
        self._reset_pi_ro()
        self._itr += 1
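
The control-variate branch above first truncates the rollout list to an even length and then splits it into two interleaved halves: even positions become ro_mix (rollouts with a random switch point) and odd positions become ro_pol. A tiny sketch of that slicing pattern with placeholder rollouts:

rollouts = ['r0', 'r1', 'r2', 'r3', 'r4']       # hypothetical rollouts
rollouts = rollouts[:(len(rollouts) // 2) * 2]  # drop the last one -> even length

ro_mix = rollouts[0::2]  # even positions: ['r0', 'r2']
ro_pol = rollouts[1::2]  # odd positions:  ['r1', 'r3']
print(ro_mix, ro_pol)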