Example #1
    def gen_ro(self, log_prefix='', to_log=False):
        ro = self._gen_ro()
        self._ndata += ro.n_samples
        if to_log:
            log_rollout_info(ro, prefix=log_prefix)
            logz.log_tabular(log_prefix + 'NumberOfDataPoints', self._ndata)
        return ro
Example #2
@contextmanager  # from contextlib; needed because timed() is used as `with timed(...)` below
def timed(msg):
    # Print the message, time the enclosed block, then print and log the elapsed time.
    print(colorize(msg, color='magenta'), end='', flush=True)
    tstart = time.perf_counter()
    yield
    t = time.perf_counter() - tstart
    print(colorize(" in %.3f seconds" % t, color='magenta'))
    logz.log_tabular(msg + ' Time', t)
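
The training loops in the later examples wrap each stage of an iteration in this
context manager, for example:

    with timed('Generate env rollouts'):
        ro = self.gen_ro(to_log=True)
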
Example #3
    def _update_func_approx(self, x, y, w, to_log=False, log_prefix=''):
        """Update the function approximator based on the current data (x, y, w),
        or through self._agg_data, which is kept up to date with (x, y, w)."""
        # initial loss (computed on the current sample only, not the aggregated data)
        loss_before = self._compute_loss(x, y, w)
        explained_variance_before = math_utils.compute_explained_variance(
            self.predict(x), y)
        # optimization
        self.prepare_for_update(x)
        x_agg, y_agg, w_agg = (self._agg_data['x'], self._agg_data['y'],
                               self._agg_data['w'])
        lr = self._update_with_lr_search(x_agg, y_agg, w_agg)  # update using aggregated data
        # new loss
        loss_after = self._compute_loss(x, y, w)
        explained_variance_after = math_utils.compute_explained_variance(
            self.predict(x), y)
        if to_log:
            logz.log_tabular('LossBefore({}){}'.format(self.name, log_prefix),
                             loss_before)
            logz.log_tabular('LossAfter({}){}'.format(self.name, log_prefix),
                             loss_after)
            logz.log_tabular(
                'ExplainedVarianceBefore({}){}'.format(self.name, log_prefix),
                explained_variance_before)
            logz.log_tabular(
                'ExplainedVarianceAfter({}){}'.format(self.name, log_prefix),
                explained_variance_after)
            logz.log_tabular(
                'UsedLearningRate({}){}'.format(self.name, log_prefix), lr)
Example #4
    def run_alg(self, n_itrs, save_policy=True, save_policy_fun=None, save_freq=3,
                save_value_fun=None, save_sim_fun=None,
                pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            if rollout_kwargs is None:
                gr = self._gen_ro_raw
            elif (rollout_kwargs['max_n_rollouts'] is None and
                  rollout_kwargs['min_n_samples'] is None):
                gr = self._gen_ro_raw
            else:
                gr = functools.partial(generate_rollout, env=self._env, **rollout_kwargs)
            self._alg.pretrain(gr, **other_pretrain_kwargs)

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(to_log=True)
            
            # algorithm-specific
            if save_policy and isinstance(save_freq, int) and itr % save_freq == 0:
                mean_val = logz.get_val_from_LOG('MeanSumOfRewards')
                prefix = 'iter_{}_eval_'.format(itr) + '%.0f' % mean_val
                save_policy_fun(prefix + '_pi')
                save_value_fun(prefix + '_vfn')
                save_sim_fun(prefix + '_sim')
            self._alg.update(ro, gen_env_ro=self._gen_ro) 
            logz.dump_tabular()  # dump log

        # Save the final policy.
        if save_policy:
            save_policy_fun('final')
            cprint('Final policy has been saved.')
Example #5
    def _update(self, env_ro, gen_env_ro):
        # gen_env_ro is just used for computing gradient std.
        assert gen_env_ro is not None

        # Pass the rollouts env_ro to the oracle self._or.
        with timed('Update Oracle'):
            # self.set_ro(env_ro)
            self._or.update(env_ro, update_nor=True, to_log=True, itr=self._itr)
        with timed('Compute Grad'):
            grads = self._or.compute_grad(ret_comps=True)
            grad = grads[0]
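            # Stack the gradient estimates across iterations (used for computing gradient statistics).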
            if self.gradients is None:
                self.gradients = grad
            else:
                self.gradients = np.concatenate([self.gradients, grad], axis=0)

            names = ['g', 'mc_g', 'ac_os', 'tau_os', 'dr_grad_os']
            for g, name in zip(grads, names):
                logz.log_tabular('norm_{}'.format(name), la.norm(g))

            self.accum_ac += grads[2]
            self.accum_tau += grads[3]
            self.accum_func += grads[4]
            logz.log_tabular('norm_accum_ac_os', la.norm(self.accum_ac / self.gradients.shape[0]))
            logz.log_tabular('norm_accum_tau_os', la.norm(self.accum_tau / self.gradients.shape[0]))
            logz.log_tabular('norm_accum_func_os', la.norm(self.accum_func / self.gradients.shape[0]))

        self._itr += 1
        logz.log_tabular('std', np.mean(self._policy.std))
Example #6
    def update(self,
               ro,
               update_nor=False,
               shift_adv=False,
               to_log=False,
               log_prefix=''):
        """
            Args:
                ro: RO object representing the new information
                update_nor: whether to update the  control variate of tfLikelihoodRatioOracle
                shift_adv: whether to force the adv values to be positive. if float, it specifies the
                    amount to shift.
        """

        self._ro = ro  # save the ref to rollouts

        # Compute adv.
        advs, vfns = self._ae.advs(ro)  # adv has its own ref_policy
        adv = np.concatenate(advs)
        if shift_adv:  # make adv non-negative
            assert self._use_log_loss
            if shift_adv is True:
                adv = adv - np.min(adv)
            else:
                adv = adv - np.mean(adv) + shift_adv
            self._nor.reset()  # defined in tfLikelihoodRatioOracle
            update_nor = False

        if not self._normalize_weighting:
            if self._avg_type == 'sum':  # rescale the problem if needed
                adv *= len(adv) / len(ro)

        # Update the loss function.
        if self._use_log_loss is True:
            #  - E_{ob} E_{ac ~ q | ob} [ w * log p(ac|ob) * adv(ob, ac) ]
            if self._onestep_weighting:  # consider importance weight
                w_or_logq = np.concatenate(
                    self._ae.weights(ro,
                                     policy=self.policy))  # helper function
            else:
                w_or_logq = np.ones_like(adv)
        else:  # False or None
            #  - E_{ob} E_{ac ~ q | ob} [ p(ac|ob)/q(ac|ob) * adv(ob, ac) ]
            assert self._onestep_weighting
            w_or_logq = ro.lps

        if to_log:
            vfn = np.concatenate(vfns)
            logz.log_tabular('max_adv', np.amax(np.abs(adv)))
            logz.log_tabular('max_vfn', np.amax(np.abs(vfn)))

        # Update the tfLikelihoodRatioOracle.
        super().update(-adv, w_or_logq, [ro.obs, ro.acs],
                       update_nor)  # loss is negative reward
Example #7
    def run_alg(self,
                n_itrs,
                pretrain=True,
                save_policy=False,
                save_freq=100,
                final_eval=False):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            self._alg.pretrain(functools.partial(self.gen_ro, to_log=False))

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(self._alg.pi_ro,
                                 logp=self._alg.logp,
                                 to_log=True)
            self._alg.update(ro)  # algorithm-specific
            logz.dump_tabular()  # dump log
Example #8
    def cal_variance(self, n_itrs, save_policy=True, save_value_fun=None, 
                save_policy_fun=None, save_freq=3,
                save_sim_fun=None,
                ro_file=None,
                save_np_file_path=None,
                prefix=None,
                save_gradient=None,
                pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
        start_time = time.time()
        
        assert prefix is not None
        assert ro_file is not None
        assert save_np_file_path is not None

        save_grad_frequency = 1  # save the accumulated gradient estimates every iteration

        ro_path = ro_file

        with open(ro_path, 'rb') as f:
            ros = pickle.load(f)

        # Main loop        
        itr = 0
        self._alg.reset_grads()
        while True:
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            
            self._alg.compute_grad(ros.pop(0), gen_env_ro=self._gen_ro)

            logz.dump_tabular()  # dump log
            
            if itr % save_grad_frequency == 0:
                np.save(save_np_file_path, self._alg.gradients)

            if not ros:
                break
            
            itr += 1
Example #9
    def run_alg(self,
                n_itrs,
                save_policy=None,
                save_policy_fun=None,
                save_freq=None,
                pretrain=True,
                rollout_kwargs=None,
                **other_pretrain_kwargs):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            if rollout_kwargs is None:
                gr = self._gen_ro_raw
            elif (rollout_kwargs['max_n_rollouts'] is None
                  and rollout_kwargs['min_n_samples'] is None):
                gr = self._gen_ro_raw
            else:
                gr = functools.partial(generate_rollout,
                                       env=self._env,
                                       **rollout_kwargs)
            self._alg.pretrain(gr, **other_pretrain_kwargs)

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(to_log=True)
            # algorithm-specific
            self._alg.update(ro, gen_env_ro=self._gen_ro)
            logz.dump_tabular()  # dump log
            if save_policy and isinstance(save_freq,
                                          int) and itr % save_freq == 0:
                save_policy_fun('{}'.format(itr))

        # Save the final policy.
        if save_policy:
            save_policy_fun('final')
            cprint('Final policy has been saved.')
Example #10
    def est_mean(self, n_itrs, save_policy=True, save_value_fun=None, 
                save_policy_fun=None, save_freq=3,
                save_sim_fun=None,
                save_gradient=None,
                save_np_file_path=None,
                pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
        start_time = time.time()        

        # Main loop
        # default estimator number is 10000; 2000 gradient estimates are used here
        for itr in range(2000):
            ro = self._gen_ro()       

            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)

            # algorithm-specific
            self._alg.compute_grad(ro, gen_env_ro=self._gen_ro)
            logz.dump_tabular()  # dump log

        mean_st = self._alg.gradients       
        est_mean = np.mean(mean_st, axis=0, keepdims=True)     
        np.save(save_np_file_path, est_mean)
Example #11
    def update(self, g=None, to_log=False, *args, **kwargs):
        assert g is not None
        # Compute V.
        if self.w is None:  # initialization (V is not needed)
            self.dim = g.shape[0]
            self.V = self._compute_V()  # XXX for logging
        else:
            assert self.V is not None  # make sure compute_grad has been queried
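            # Error of the current weighted combination V @ w in predicting the new gradient g.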
            pred_error_size = la.norm(np.dot(self.V, self.w) - g)

        # Update the most recent oracle using new samples (rotate right).
        oracle = self._base_oracles.pop()  # pop the rightmost element
        oracle.update(to_log=to_log, *args, **kwargs)
        self._base_oracles.appendleft(oracle)
        if self.n_valid_base_oracles < self.n_base_oracles:
            self.n_valid_base_oracles += 1
        # Regression using true grads
        if self.mode == 'average':
            self.w = np.zeros(self.n_base_oracles)
            self.w[:self.n_valid_base_oracles] = 1.0 / self.n_valid_base_oracles
        elif self.mode == 'recent':
            self.w = np.zeros(self.n_base_oracles)
            self.w[0] = 1.0
        else:
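            # Otherwise self.mode acts as a forgetting factor: fit w by (discounted)
            # least squares so that V @ w approximates the true gradient g, then clip
            # the weights to [0, 2].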
            if self.w is None:  # initialization. cst * 1/2 * (w - e_1)^2
                self.w = np.zeros(self.n_base_oracles)
                self.w[0] = 1.0
                self.A = (self.reg_factor * la.norm(g)**2 /
                          self.n_base_oracles) * np.eye(self.n_base_oracles)
                self.b = np.dot(self.A, self.w)
            else:
                self.A = (1.0 - self.mode) * self.A + np.matmul(
                    self.V.T, self.V)
                self.b = (1.0 - self.mode) * self.b + np.matmul(self.V.T, g)
                self.w = la.solve(self.A, self.b)
                self.w = np.clip(self.w, 0.0, 2.0)  # XXX

        if to_log:
            logz.log_tabular('min_weights', np.min(self.w))
            logz.log_tabular('max_weights', np.max(self.w))
            logz.log_tabular('norm_weights', la.norm(self.w))

        # Reset V.
        self.V = None
Example #12
    def _update(self, env_ro, gen_env_ro):
        # gen_env_ro is just used for computing gradient std.
        assert gen_env_ro is not None

        # XXX If using simulation to train vf, vf should be updated after policy nor is updated.
        if self.gen_sim_ro is not None:
            with timed('Generate sim data'):
                sim_ro = self.gen_sim_ro()
            with timed('Update ae'):
                self._or.update_ae(sim_ro,
                                   to_log=True)  # update value function

        if self.log_sigmas_freq is not None and self._itr % self.log_sigmas_freq == 0:
            with timed('Compute Sigmas'):
                self._or.log_sigmas(**self.log_sigmas_kwargs)

        with timed('Update Oracle'):
            self._or.update(env_ro,
                            update_nor=True,
                            to_log=True,
                            itr=self._itr)
        with timed('Compute Grad'):
            grads = self._or.compute_grad(ret_comps=True)
            grad = grads[0]
            names = ['g', 'mc_g', 'ac_os', 'tau_os']
            for g, name in zip(grads, names):
                logz.log_tabular('norm_{}'.format(name), la.norm(g))
        with timed('Take Gradient Step'):
            self._learner.update(grad,
                                 self._or.ro)  # take the grad with the env_ro
        if self.gen_sim_ro is None:
            with timed('Update ae'):
                self._or.update_ae(env_ro,
                                   to_log=True)  # update value function
        # Always update dynamics using true data.
        with timed('Update dyn'):
            self._or.update_dyn(env_ro, to_log=True)  # update dynamics
        with timed('Update rw'):
            self._or.update_rw(env_ro, to_log=True)
        self._itr += 1
        logz.log_tabular('online_learner_stepsize', self._learner.stepsize)
        logz.log_tabular('std', np.mean(self._policy.std))
Example #13
def log_rollout_info(ro, prefix=''):
    # print('Logging rollout info')
    if not hasattr(log_rollout_info, "total_n_samples"):
        log_rollout_info.total_n_samples = {}  # static variable
    if prefix not in log_rollout_info.total_n_samples:
        log_rollout_info.total_n_samples[prefix] = 0
    sum_of_rewards = [rollout.rws.sum() for rollout in ro.rollouts]
    rollout_lens = [len(rollout) for rollout in ro.rollouts]
    n_samples = sum(rollout_lens)
    log_rollout_info.total_n_samples[prefix] += n_samples
    logz.log_tabular(prefix + "NumSamplesThisBatch", n_samples)
    logz.log_tabular(prefix + "NumberOfRollouts", len(ro))
    logz.log_tabular(prefix + "TotalNumSamples",
                     log_rollout_info.total_n_samples[prefix])
    logz.log_tabular(prefix + "MeanSumOfRewards", np.mean(sum_of_rewards))
    logz.log_tabular(prefix + "StdSumOfRewards", np.std(sum_of_rewards))
    logz.log_tabular(prefix + "MaxSumOfRewards", np.max(sum_of_rewards))
    logz.log_tabular(prefix + "MinSumOfRewards", np.min(sum_of_rewards))
    logz.log_tabular(prefix + "MeanRolloutLens", np.mean(rollout_lens))
    logz.log_tabular(prefix + "StdRolloutLens", np.std(rollout_lens))
    logz.log_tabular(
        prefix + "MeanOfRewards",
        np.sum(sum_of_rewards) / (n_samples + len(sum_of_rewards)))
Example #14
    def update(self, ro):

        self._ro = ro
        if not self._ignore_samples:
            # update input normalizer for whitening
            self._policy.prepare_for_update(self._ro.obs)

            # Correction Step (Model-free)
            self._correction()

        # end of round
        self._itr += 1

        # log
        logz.log_tabular('pcl_stepsize', self._pcl.stepsize)
        logz.log_tabular('std', np.mean(self._policy.std))
        if not self._ignore_samples:
            logz.log_tabular('true_grads_size', np.linalg.norm(self._g))
            logz.log_tabular('pred_grads_size',
                             np.linalg.norm(self._pcl.g_hat))
            pred_error_size = np.linalg.norm(self._g - self._pcl.g_hat)
            ratio = pred_error_size / np.linalg.norm(self._g)
            logz.log_tabular('pred_error_size', pred_error_size)
            logz.log_tabular('pred_error_true_ratio', ratio)

        # Prediction Step (Model-based)
        if self._w_pred:
            self._prediction()

        # log
        logz.log_tabular('std_after', np.mean(self._policy.std))
Example #15
    def log_sigmas(self,
                   idx=100,
                   n_ros=30,
                   n_acs=30,
                   n_taus=30,
                   n_steps=None,
                   use_vf=False):
        # Estimate the variance of G_idx under different control variates for comparison.
        # n_steps: roll out each tau for at most n_steps.
        # use_vf: use the value function to reduce the variance in estimating E_a E_tau NQ.

        # Collect samples.
        # Data structure:
        #   sts: 2d array.
        #   acs: 3d array.
        #   advs (advantage function): 3d array.
        #   N (log probability gradient): 3d array.

        # XXX
        # Use state baseline to reduce the variance of the estimates.
        ro = self.gen_ro(max_n_rollouts=n_ros, max_rollout_len=idx + 1)
        sts = np.array([r.obs[idx] for r in ro.rollouts if len(r) > idx])
        n_sts = len(sts)

        if n_sts == 0:
            log = {
                'sigma_s_mc': .0,
                'sigma_a_mc': .0,
                'sigma_tau_mc': .0,
                'n_ros_in_total': n_sts * n_acs * n_taus,
                'n_sts': n_sts,
            }
        else:
            acs = self.policy.pi(np.repeat(sts, n_acs, axis=0))
            acs = np.reshape(acs, [n_sts, n_acs, -1])
            Q = np.zeros((n_ros, n_acs, n_taus))
            N_dim = len(self.policy.logp_grad(ro.obs[0], ro.acs[0]))
            N = np.zeros((n_ros, n_acs, N_dim))
            decay = self.ae._pe.gamma * self.delta
            for i, s in enumerate(sts):
                for j, a in enumerate(acs[i]):
                    # This should be the bottleneck!!
                    ro = self.gen_ro(max_n_rollouts=n_taus,
                                     max_rollout_len=n_steps,
                                     start_state=s,
                                     start_action=a)
                    N[i, j] = self.policy.logp_grad(s, a)
                    for k, r in enumerate(ro.rollouts):
                        q0 = ((decay**np.arange(len(r))) * r.rws).sum()
                        Q[i, j, k] = q0

            # Fill the rest with zeros.
            if use_vf:
                V = np.zeros((n_ros))
                for i, s in enumerate(sts):
                    V[i] = self.ae._vfn.predict(s[None])[0]

            def compute_sigma_s(Q):
                E_tau_Q = np.mean(Q, axis=2)  # s x a
                if use_vf:
                    E_tau_Q -= np.expand_dims(V, axis=-1)  # s x 1
                E_tau_Q = np.expand_dims(E_tau_Q, axis=-1)  # s x a x 1
                E_a_tau_NQ = np.mean(E_tau_Q * N, axis=1)  # s x N
                E_s_a_tau_NQ = np.mean(E_a_tau_NQ, axis=0)  # N
                E_s_a_tau_NQ = np.expand_dims(E_s_a_tau_NQ, axis=0)  # 1 x N
                Var = np.mean(np.square(E_a_tau_NQ - E_s_a_tau_NQ),
                              axis=0)  # N
                sigma = np.sqrt(np.sum(Var))

                return sigma

            def compute_sigma_a(Q):
                E_tau_Q = np.mean(Q, axis=2)  # s x a
                E_tau_Q = np.expand_dims(E_tau_Q, axis=-1)  # s x a x 1
                N_E_tau_Q = N * E_tau_Q  # s x a x N
                if use_vf:
                    N_E_tau_Q_for_E_a = N * (E_tau_Q -
                                             np.reshape(V, V.shape + (1, 1)))
                else:
                    N_E_tau_Q_for_E_a = N_E_tau_Q
                E_a_N_E_tau_Q = np.mean(N_E_tau_Q_for_E_a, axis=1)  # s x N
                E_a_N_E_tau_Q = np.expand_dims(E_a_N_E_tau_Q,
                                               axis=1)  # s x 1 x N
                Var = np.mean(np.square(N_E_tau_Q - E_a_N_E_tau_Q),
                              axis=1)  # s x N
                sigma = np.sqrt(np.sum(np.mean(Var, axis=0)))

                return sigma

            def compute_sigma_tau(Q):
                E_tau_Q = np.mean(Q, axis=2)  # s x a
                E_tau_Q = np.expand_dims(E_tau_Q, axis=-1)  # s x a x 1
                Var = np.mean(np.square(Q - E_tau_Q), axis=2)  # s x a
                Var = np.expand_dims(Var, axis=-1)  # s x a x 1
                sigma = np.sqrt(
                    np.sum(np.mean(np.square(N) * Var, axis=(0, 1))))
                return sigma

            log = {
                'sigma_s_mc': compute_sigma_s(Q),
                'sigma_a_mc': compute_sigma_a(Q),
                'sigma_tau_mc': compute_sigma_tau(Q),
                'n_ros_in_total': n_sts * n_acs * n_taus,
                'n_sts': n_sts,
            }

        for k, v in log.items():
            logz.log_tabular(k, v)