Example #1
    def _update(self, env_ro, gen_env_ro):
        # gen_env_ro is just used for computing gradient std.
        assert gen_env_ro is not None

        # Pass the environment rollouts to the oracle self._or.
        with timed('Update Oracle'):
            # self.set_ro(env_ro)
            self._or.update(env_ro, update_nor=True, to_log=True, itr=self._itr)
        with timed('Compute Grad'):
            grads = self._or.compute_grad(ret_comps=True)
            grad = grads[0]
            # Stack gradients as rows so that shape[0] counts the number of
            # updates, which normalizes the accumulators logged below.
            grad_row = grad.reshape(1, -1)
            if self.gradients is None:
                self.gradients = grad_row
            else:
                self.gradients = np.concatenate([self.gradients, grad_row], axis=0)

            names = ['g', 'mc_g', 'ac_os', 'tau_os', 'dr_grad_os']
            for g, name in zip(grads, names):
                logz.log_tabular('norm_{}'.format(name), la.norm(g))

            self.accum_ac += grads[2]
            self.accum_tau += grads[3]
            self.accum_func += grads[4]
            logz.log_tabular('norm_accum_ac_os', la.norm(self.accum_ac / self.gradients.shape[0]))
            logz.log_tabular('norm_accum_tau_os', la.norm(self.accum_tau / self.gradients.shape[0]))
            logz.log_tabular('norm_accum_func_os', la.norm(self.accum_func / self.gradients.shape[0]))

        self._itr += 1
        logz.log_tabular('std', np.mean(self._policy.std))
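
The accumulator logging above relies on the gradients being stacked as rows so that shape[0] counts iterations. Below is a minimal, self-contained sketch of that pattern (NumPy only; the sizes and names are illustrative, not from the codebase):

import numpy as np
import numpy.linalg as la

rng = np.random.default_rng(0)
stacked = None        # rows = iterations, columns = flattened parameters
accum = np.zeros(4)   # running sum of one gradient component

for itr in range(3):
    grad = rng.normal(size=4)            # stand-in for grads[0]
    row = grad.reshape(1, -1)            # promote to a (1, dim) row
    stacked = row if stacked is None else np.concatenate([stacked, row], axis=0)
    accum += grad
    # norm of the running average, as logged with logz above
    print(itr, la.norm(accum / stacked.shape[0]))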
Example #2
    def run_alg(self, n_itrs, save_policy=True, save_policy_fun=None, save_freq=3,
                save_value_fun=None, save_sim_fun=None,
                pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            if rollout_kwargs is None:
                gr = self._gen_ro_raw
            elif (rollout_kwargs['max_n_rollouts'] is None and
                  rollout_kwargs['min_n_samples'] is None):
                gr = self._gen_ro_raw
            else:
                gr = functools.partial(generate_rollout, env=self._env, **rollout_kwargs)
            self._alg.pretrain(gr, **other_pretrain_kwargs)

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(to_log=True)
            
            # algorithm-specific
            if save_policy and isinstance(save_freq, int) and itr % save_freq == 0:
                mean_val = logz.get_val_from_LOG('MeanSumOfRewards')
                prefix = 'iter_{}_eval_{:.0f}'.format(itr, mean_val)
                save_policy_fun(prefix + '_pi')
                save_value_fun(prefix + '_vfn')
                save_sim_fun(prefix + '_sim')
            self._alg.update(ro, gen_env_ro=self._gen_ro) 
            logz.dump_tabular()  # dump log

        # Save the final policy.
        if save_policy:
            save_policy_fun('final')
            cprint('Final policy has been saved.')
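
For reference, the gr = functools.partial(...) branch above binds the environment and the rollout limits once, so the algorithm can later call the generator with only the policy arguments (as in Example #10, generate_rollout is invoked as generate_rollout(pi, logp, env, ...)). A minimal sketch with a dummy generate_rollout and made-up values:

import functools

def generate_rollout(pi, logp, env, min_n_samples=None, max_n_rollouts=None,
                     max_rollout_len=None):
    # dummy stand-in that only reports what it was asked to do
    print('rollout in {} (max_n_rollouts={})'.format(env, max_n_rollouts))

rollout_kwargs = dict(min_n_samples=None, max_n_rollouts=8, max_rollout_len=100)
gr = functools.partial(generate_rollout, env='some_env', **rollout_kwargs)
gr(pi=None, logp=None)  # later invoked with only the policy arguments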
Example #3
    def pretrain(self, gen_ro):
        with timed('Pretraining'):
            # Implement necessary pretraining procedures here.
            if isinstance(self._or, Or.tfPolicyGradient):
                self._ro = gen_ro(self.pi, logp=self.logp)
                self._or.update_ae(self._ro)

            # take a prediction step first
            if self._take_first_pred and self._w_pred:
                self._prediction()
Example #4
    def _correction(self):
        # single first-order update
        with timed('Update oracle'):
            self._or.update(self._ro, update_nor=True)
            if callable(getattr(self._or, 'update_ae', None)):
                self._or.update_ae(self._ro, to_log=True)

        with timed('Compute policy gradient'):
            g = self._or.compute_grad()
            self._g = g

        if self._w_corr:
            if self._update_rule in ['dyna', 'model_free']:
                self._pcl.clear_g_hat()  # make sure hat_g is None
            with timed('Take piccolo correction step'):
                kwargs = {}
                if isinstance(self._pcl, rlPiccoloFisher):
                    kwargs['ro'] = self._ro
                self._pcl.update(g, 'correct', **kwargs)
Example #5
                    def callback():
                        with timed('Update model oracle (callback)'):
                            self._mor.update(update_nor=True,
                                             update_ae=update_ae_and_nor,
                                             update_pol_nor=update_ae_and_nor)

                            method = getattr(self._pcl, 'method', None)
                            if isinstance(method, rlPiccoloFisher):
                                method.assign(self._policy)  # sync the normalizer
                                method.ro = self._mor.ro
                            if isinstance(self._pcl, rlPiccoloFisher):
                                self._pcl._reg_swp.update(self._mor.ro.obs)
Example #6
    def pretrain(self, gen_ro, n_vf_updates=1, n_dyn_updates=1, n_rw_updates=1,
                 update_pol_nor=True, **kwargs):

        with timed('Pretraining'):
            ro = gen_ro(self.pi, logp=self.logp)
            if update_pol_nor:
                self._policy.prepare_for_update(ro.obs)  # update nor of policy
            # Regenerate rollouts with the updated policy normalizer.
            ro = gen_ro(self.pi, logp=self.logp)
            for _ in range(n_vf_updates):
                self._or.update_ae(ro)
            for _ in range(n_dyn_updates):
                self._or.update_dyn(ro)
            for _ in range(n_rw_updates):
                self._or.update_rw(ro)
Example #7
    def _update(self, env_ro, gen_env_ro):
        # gen_env_ro is just used for computing gradient std.
        assert gen_env_ro is not None

        # XXX When simulation is used to train the vf, the vf should be updated
        # after the policy nor is updated.
        if self.gen_sim_ro is not None:
            with timed('Generate sim data'):
                sim_ro = self.gen_sim_ro()
            with timed('Update ae'):
                self._or.update_ae(sim_ro,
                                   to_log=True)  # update value function

        if self.log_sigmas_freq is not None and self._itr % self.log_sigmas_freq == 0:
            with timed('Compute Sigmas'):
                self._or.log_sigmas(**self.log_sigmas_kwargs)

        with timed('Update Oracle'):
            self._or.update(env_ro,
                            update_nor=True,
                            to_log=True,
                            itr=self._itr)
        with timed('Compute Grad'):
            grads = self._or.compute_grad(ret_comps=True)
            grad = grads[0]
            names = ['g', 'mc_g', 'ac_os', 'tau_os']
            for g, name in zip(grads, names):
                logz.log_tabular('norm_{}'.format(name), la.norm(g))
        with timed('Take Gradient Step'):
            self._learner.update(grad,
                                 self._or.ro)  # take the grad with the env_ro
        if self.gen_sim_ro is None:
            with timed('Update ae'):
                self._or.update_ae(env_ro,
                                   to_log=True)  # update value function
        # Always update dynamics using true data.
        with timed('Update dyn'):
            self._or.update_dyn(env_ro, to_log=True)  # update dynamics
        with timed('Update rw'):
            self._or.update_rw(env_ro, to_log=True)
        self._itr += 1
        logz.log_tabular('online_learner_stepsize', self._learner.stepsize)
        logz.log_tabular('std', np.mean(self._policy.std))
Example #8
    def run_alg(self,
                n_itrs,
                pretrain=True,
                save_policy=False,
                save_freq=100,
                final_eval=False):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            self._alg.pretrain(functools.partial(self.gen_ro, to_log=False))

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(self._alg.pi_ro,
                                 logp=self._alg.logp,
                                 to_log=True)
            self._alg.update(ro)  # algorithm-specific
            logz.dump_tabular()  # dump log
Example #9
    def run_alg(self,
                n_itrs,
                save_policy=None,
                save_policy_fun=None,
                save_freq=None,
                pretrain=True,
                rollout_kwargs=None,
                **other_pretrain_kwargs):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            if rollout_kwargs is None:
                gr = self._gen_ro_raw
            elif (rollout_kwargs['max_n_rollouts'] is None
                  and rollout_kwargs['min_n_samples'] is None):
                gr = self._gen_ro_raw
            else:
                gr = functools.partial(generate_rollout,
                                       env=self._env,
                                       **rollout_kwargs)
            self._alg.pretrain(gr, **other_pretrain_kwargs)

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(to_log=True)
            # algorithm-specific
            self._alg.update(ro, gen_env_ro=self._gen_ro)
            logz.dump_tabular()  # dump log
            if (save_policy and isinstance(save_freq, int)
                    and itr % save_freq == 0):
                save_policy_fun('{}'.format(itr))

        # Save the final policy.
        if save_policy:
            save_policy_fun('final')
            cprint('Final policy has been saved.')
Example #10
def time_batch_env(envid, n_envs):
    seed = 0
    n_ro = 5000
    e = envs.create_env(envid, seed)

    def pi(obs):
        # Sample one action and replicate it for every observation in the batch.
        ac = e.action_space.sample()
        ac = [ac for _ in range(len(obs))]
        return ac

    # env = envs.create_batch_env(envid, seed, 1, use_ext_proc=False)
    # roller = Roller(env, min_n_samples=None, max_n_rollouts=n_ro, max_rollout_len=None)
    # with timed('1 env generate {} ros'.format(n_ro)):
    #     roller.gen_ro(pi=pi, logp=None)

    # env = envs.create_batch_env(envid, seed, n_envs, use_ext_proc=True)
    # roller = Roller(env, min_n_samples=None, max_n_rollouts=n_ro, max_rollout_len=None)
    # with timed('{} envs parallel generating {} ros'.format(n_envs, n_ro)):
    #     roller.gen_ro(pi=pi, logp=None)

    e = envs.create_batch_env(envid, seed, 1, use_ext_proc=False)
    with timed('1 env generating {} ros'.format(n_ro)):
        generate_rollout(lambda ob: e.action_space.sample(),
                         None, e, min_n_samples=None, max_n_rollouts=n_ro, max_rollout_len=None)
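
The timed(...) helper used throughout these snippets is not shown here; assuming it is a context manager that reports the wall-clock time of its block, a minimal stand-in could look like the following sketch:

import time
from contextlib import contextmanager

@contextmanager
def timed(message):
    start = time.time()
    try:
        yield
    finally:
        print('{} took {:.3f}s'.format(message, time.time() - start))

with timed('toy workload'):
    sum(i * i for i in range(10 ** 6))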
Example #11
    def _prediction(self):
        # (multi-step) update using model-information

        with timed('Update model oracle'):
            # flags
            shift_adv = self._shift_adv and isinstance(self._pcl, PiccoloOpt)
            # whether to update pol_nor and ae in the model update
            update_ae_and_nor = self._pre_w_adap or self._update_in_pred

            # mimic the oracle update
            kwargs = {'update_nor': True, 'to_log': True}
            if isinstance(self._mor, Or.SimulationOracle):
                kwargs['update_ae'] = update_ae_and_nor
                kwargs['update_pol_nor'] = update_ae_and_nor
            elif (isinstance(self._mor, Or.LazyOracle)
                  or isinstance(self._mor, Or.AggregatedOracle)
                  or isinstance(self._mor, Or.AdversarialOracle)):
                kwargs['shift_adv'] = shift_adv
            elif isinstance(self._mor, Or.DummyOracle):
                kwargs['g'] = self._g
            else:
                raise NotImplementedError(
                    'Model oracle update is not implemented.')
            self._mor.update(ro=self._ro, **kwargs)

        with timed('Compute model gradient'):
            g_hat = self._mor.compute_grad()

        with timed('Take piccolo prediction step'):
            kwargs = {}
            if isinstance(self._pcl, rlPiccoloFisher):
                kwargs['ro'] = self._mor.ro

            if isinstance(self._pcl, PiccoloOpt):
                # need to define the optimization problem
                kwargs['grad_hat'] = self._mor.compute_grad
                kwargs['loss_hat'] = self._mor.compute_loss
                kwargs['warm_start'] = self._warm_start
                kwargs['stop_std_grad'] = self._stop_std_grad
                if isinstance(self._mor, Or.SimulationOracle):

                    def callback():
                        with timed('Update model oracle (callback)'):
                            self._mor.update(update_nor=True,
                                             update_ae=update_ae_and_nor,
                                             update_pol_nor=update_ae_and_nor)

                            method = getattr(self._pcl, 'method', None)
                            if isinstance(method, rlPiccoloFisher):
                                method.assign(self._policy)  # sync the normalizer
                                method.ro = self._mor.ro
                            if isinstance(self._pcl, rlPiccoloFisher):
                                self._pcl._reg_swp.update(self._mor.ro.obs)

                    kwargs['callback'] = callback

            # adapt for 'dyna' and 'model-based'
            self._pcl.update(g_hat,
                             'predict',
                             adapt=self._pre_w_adap,
                             **kwargs)