Example #1
    def dump_tabular(self):
        """
        Write all of the diagnostics from the current iteration.

        Writes both to stdout, and to the output file.
        """
        if proc_id() == 0:
            vals = []
            key_lens = [len(key) for key in self.log_headers]
            max_key_len = max(15, max(key_lens))
            keystr = '%' + '%d' % max_key_len
            fmt = "| " + keystr + "s | %15s |"
            n_slashes = 22 + max_key_len
            print("-" * n_slashes)
            for key in self.log_headers:
                val = self.log_current_row.get(key, "")
                valstr = "%8.3g" % val if hasattr(val, "__float__") else val
                print(fmt % (key, valstr))
                vals.append(val)
            print("-" * n_slashes, flush=True)
            if self.output_file is not None:
                if self.first_row:
                    self.output_file.write("\t".join(self.log_headers) + "\n")
                self.output_file.write("\t".join(map(str, vals)) + "\n")
                self.output_file.flush()
        self.log_current_row.clear()
        self.first_row = False
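
A minimal usage sketch (not part of the source; hypothetical path and metric values), assuming the ``Logger`` API shown here and in Example #6: each diagnostic is staged with ``log_tabular`` and the row is written once per iteration with ``dump_tabular``, mirroring the end-of-epoch logging in Example #7.

# Sketch only: hypothetical output_dir and metric values.
logger = Logger(output_dir='/tmp/experiments/demo', exp_name='demo')
for epoch in range(3):
    logger.log_tabular('Epoch', epoch)        # stage one diagnostic for this row
    logger.log_tabular('AverageEpRet', 1.0)   # hypothetical metric value
    logger.dump_tabular()                     # print the row and append it to progress.txt
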
Example #2
    def save_state(self, state_dict, itr=None):
        """
        Saves the state of an experiment.

        To be clear: this is about saving *state*, not logging diagnostics.
        All diagnostic logging is separate from this function. This function
        will save whatever is in ``state_dict``---usually just a copy of the
        environment---and the most recent parameters for the model you 
        previously set up saving for with ``setup_tf_saver``. 

        Call with any frequency you prefer. If you only want to maintain a
        single state and overwrite it at each call with the most recent 
        version, leave ``itr=None``. If you want to keep all of the states you
        save, provide unique (increasing) values for 'itr'.

        Args:
            state_dict (dict): Dictionary containing essential elements to
                describe the current state of training.

            itr: An int, or None. Current iteration of training.
        """
        if proc_id() == 0:
            fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
            try:
                joblib.dump(state_dict, osp.join(self.output_dir, fname))
            except:
                self.log('Warning: could not pickle state_dict.', color='red')
            if hasattr(self, 'tf_saver_elements'):
                self._tf_simple_save(itr)
            if hasattr(self, 'pytorch_saver_elements'):
                self._pytorch_simple_save(itr)
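
A short sketch (assuming the ``logger``, ``env``, and ``epoch`` names from Example #7) of the two ways to call ``save_state`` described in the docstring:

# Sketch only: overwrite a single snapshot vs. keep one snapshot per call.
logger.save_state({'env': env}, itr=None)   # always writes/overwrites vars.pkl
logger.save_state({'env': env}, itr=epoch)  # writes vars<epoch>.pkl, keeping every snapshot
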
Example #3
    def save_config(self, config):
        """
        Log an experiment configuration.

        Call this once at the top of your experiment, passing in all important
        config vars as a dict. This will serialize the config to JSON, while
        handling anything which can't be serialized in a graceful way (writing
        as informative a string as possible). 

        Example use:

        .. code-block:: python

            logger = EpochLogger(**logger_kwargs)
            logger.save_config(locals())
        """
        config_json = convert_json(config)
        if self.exp_name is not None:
            config_json['exp_name'] = self.exp_name
        if proc_id() == 0:
            output = json.dumps(config_json,
                                separators=(',', ':\t'),
                                indent=4,
                                sort_keys=True)
            print(colorize('Saving config:\n', color='cyan', bold=True))
            print(output)
            with open(osp.join(self.output_dir, "config.json"), 'w') as out:
                out.write(output)
Example #4
 def _pytorch_simple_save(self, itr=None):
     """
     Saves the PyTorch model (or models).
     """
     if proc_id() == 0:
         assert hasattr(self, 'pytorch_saver_elements'), \
             "First have to setup saving with self.setup_pytorch_saver"
         fpath = 'pyt_save'
         fpath = osp.join(self.output_dir, fpath)
         fname = 'model' + ('%d' % itr if itr is not None else '') + '.pt'
         fname = osp.join(fpath, fname)
         os.makedirs(fpath, exist_ok=True)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             torch.save(self.pytorch_saver_elements, fname)
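
A restore sketch (assumed, not part of the source): the file written above holds whatever object was registered with ``setup_pytorch_saver`` (the whole actor-critic module in Example #7), so it can be reloaded directly with ``torch.load``.

import os.path as osp
import torch

output_dir = '/tmp/experiments/demo'                   # hypothetical
fpath = osp.join(output_dir, 'pyt_save', 'model.pt')   # 'model<itr>.pt' if itr was given
ac = torch.load(fpath)   # returns the saved object, e.g. an nn.Module
ac.eval()                # assuming it is an nn.Module, switch to eval mode
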
Example #5
 def _tf_simple_save(self, itr=None):
     """
     Uses simple_save to save a trained model, plus info to make it easy
     to associate tensors with variables after restore.
     """
     if proc_id() == 0:
         assert hasattr(self, 'tf_saver_elements'), \
             "First have to setup saving with self.setup_tf_saver"
         fpath = 'tf1_save' + ('%d' % itr if itr is not None else '')
         fpath = osp.join(self.output_dir, fpath)
         if osp.exists(fpath):
             # simple_save refuses to be useful if fpath already exists,
             # so just delete fpath if it's there.
             shutil.rmtree(fpath)
         tf.saved_model.simple_save(export_dir=fpath,
                                    **self.tf_saver_elements)
         joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl'))
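
A restore sketch for the TF1 SavedModel written above (assumed TF1.x session API; not part of the source):

import os.path as osp
import joblib
import tensorflow as tf

output_dir = '/tmp/experiments/demo'        # hypothetical
fpath = osp.join(output_dir, 'tf1_save')    # 'tf1_save<itr>' if itr was given
sess = tf.Session()
tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], fpath)
info = joblib.load(osp.join(fpath, 'model_info.pkl'))  # info for associating tensors to variables
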
Example #6
    def __init__(self,
                 output_dir=None,
                 output_fname='progress.txt',
                 exp_name=None):
        """
        Initialize a Logger.

        Args:
            output_dir (string): A directory for saving results to. If 
                ``None``, defaults to a temp directory of the form
                ``/tmp/experiments/somerandomnumber``.

            output_fname (string): Name for the tab-separated-value file 
                containing metrics logged throughout a training run. 
                Defaults to ``progress.txt``. 

            exp_name (string): Experiment name. If you run multiple training
                runs and give them all the same ``exp_name``, the plotter
                will know to group them. (Use case: if you run the same
                hyperparameter configuration with multiple random seeds, you
                should give them all the same ``exp_name``.)
        """
        if proc_id() == 0:
            self.output_dir = output_dir or "/tmp/experiments/%i" % int(
                time.time())
            if osp.exists(self.output_dir):
                print(
                    "Warning: Log dir %s already exists! Storing info there anyway."
                    % self.output_dir)
            else:
                os.makedirs(self.output_dir)
            self.output_file = open(osp.join(self.output_dir, output_fname),
                                    'w')
            atexit.register(self.output_file.close)
            print(
                colorize("Logging data to %s" % self.output_file.name,
                         'green',
                         bold=True))
        else:
            self.output_dir = None
            self.output_file = None
        self.first_row = True
        self.log_headers = []
        self.log_current_row = {}
        self.exp_name = exp_name
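
A small sketch of the ``exp_name`` convention described in the docstring (hypothetical names and paths): runs of the same configuration with different seeds share one ``exp_name`` so the plotter can group them.

for seed in (0, 10, 20):
    logger = Logger(output_dir='/tmp/experiments/myrun_s%d' % seed,  # hypothetical
                    exp_name='myrun')
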
Example #7
def LBPO(env_fn,
         env_name='',
         actor_critic=core.MLPActorCriticCost,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         clip_ratio=0.2,
         pi_lr=3e-4,
         vf_lr=1e-3,
         jf_lr=1e-3,
         penalty_init=1.,
         penalty_lr=5e-2,
         cost_lim=25,
         train_pi_iters=80,
         train_v_iters=80,
         lam=0.97,
         max_ep_len=1000,
         target_kl=0.01,
         target_l2=0.012,
         logger_kwargs=dict(),
         save_freq=10,
         beta=0.01,
         beta_thres=0.05):
    """
    Lyapunov Barrier Policy Optimization

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        env_name (string): Name of the environment.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to LBPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        cost_lim (float): Cumulative constraint threshold that we want the agent to respect.

        target_l2 (float): Hard trust-region constraint on the policy update,
            used as the limit in the trust-region step (a hard analogue of
            ``target_kl``).

        beta (float): Barrier parameter controlling the amount of risk aversion.

        beta_thres (float): Threshold on the barrier weight ``beta / epsilon``;
            when the weight falls below it, the cost-gradient term is dropped
            from the surrogate objective. Defaults to 0.05.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    if 'Grid' in env_name:
        ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    else:
        ac = torch.load('safe_initial_policies/' + env_name + '.pt')

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.Qv1])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up penalty params
    soft_penalty = Variable(torch.exp(torch.Tensor([penalty_init])) - 1,
                            requires_grad=True)
    penalty_optimizer = torch.optim.Adam([soft_penalty], lr=penalty_lr)
    print("Beta: {} Beta threshold: {}".format(beta, beta_thres))
    constraint_violations = [0]
    constraint_violations_count = [0]

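    # Mixes the parameters of the updated policy with the safe baseline policy,
    # backing off toward the baseline (halving ls_alpha each step) until the
    # estimated increase in cost-Q over the baseline stays within epsilon.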
    def safe_transform(data, baseline_pi, pi, epsilon, proj_max_dist):
        # Do a line search
        max_steps = 10
        obs = data['obs']
        for step in range(max_steps):
            ls_alpha = 0.5**step

            for param1, param2, target_param in zip(
                    ac.pi.parameters(), ac.baseline_pi.parameters(),
                    ac.pi_mix.parameters()):
                target_param.data.copy_((ls_alpha) * param1.data +
                                        (1 - ls_alpha) * param2.data)

            mix_act = ac.act_pi(ac.pi_mix, obs).detach()
            epsilon_observed = ac.Qj1(torch.cat(
                (obs, mix_act), dim=1)) - ac.Qj1(
                    torch.cat((obs, ac.baseline_pi(obs)), dim=1))
            if epsilon_observed.mean() <= epsilon or step == max_steps - 1:
                for param, target_param in zip(ac.pi_mix.parameters(),
                                               ac.pi.parameters()):
                    target_param.data.copy_(param.data)

                break
        return ls_alpha

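    # Conjugate-gradient solver for F x = b, where F is given implicitly as a
    # Hessian-vector product (Avp); yields the natural-gradient direction.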
    def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10):
        x = torch.zeros(b.size())
        r = b.clone()
        p = b.clone()
        rdotr = torch.dot(r, r)
        for i in range(nsteps):
            _Avp = Avp(p)
            alpha = rdotr / torch.dot(p, _Avp)
            x += alpha * p
            r -= alpha * _Avp
            new_rdotr = torch.dot(r, r)
            betta = new_rdotr / rdotr
            p = r + betta * p
            rdotr = new_rdotr
            if rdotr < residual_tol:
                break
        return x

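    # Backtracking line search: shrink the step by powers of 0.5 and accept the
    # first step whose actual improvement is positive and at least accept_ratio
    # times the expected first-order improvement.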
    def linesearch(model,
                   f,
                   x,
                   fullstep,
                   expected_improve_rate,
                   max_backtracks=10,
                   accept_ratio=.1):
        fval = f().data
        for (_n_backtracks,
             stepfrac) in enumerate(.5**np.arange(max_backtracks)):
            xnew = x + stepfrac * fullstep
            set_flat_params_to(model, xnew)
            newfval = f().data
            actual_improve = fval - newfval
            expected_improve = expected_improve_rate * stepfrac
            ratio = actual_improve / expected_improve
            if ratio.item() > accept_ratio and actual_improve.item() > 0:
                return True, xnew
        return False, x

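    # TRPO-style step: compute the natural-gradient direction with conjugate
    # gradient, rescale it so the quadratic constraint model equals max_kl,
    # then backtrack with the line search above.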
    def trust_region_step(model, get_loss, get_kl, max_kl, damping):
        loss = get_loss()
        grads = torch.autograd.grad(loss, model.parameters())
        loss_grad = torch.cat([grad.view(-1) for grad in grads]).data

        def Fvp(v):
            kl = get_kl()
            kl = kl.mean()

            grads = torch.autograd.grad(kl,
                                        model.parameters(),
                                        create_graph=True)
            flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

            kl_v = (flat_grad_kl * Variable(v)).sum()
            grads = torch.autograd.grad(kl_v, model.parameters())
            flat_grad_grad_kl = torch.cat(
                [grad.contiguous().view(-1) for grad in grads]).data

            return flat_grad_grad_kl + v * damping

        stepdir = conjugate_gradients(Fvp, -loss_grad, 10)

        shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True)

        lm = torch.sqrt(shs / max_kl)
        fullstep = stepdir / lm[0]

        neggdotstepdir = (-loss_grad * stepdir).sum(0, keepdim=True)
        print("lagrange multiplier:", lm[0], "grad_norm:", loss_grad.norm())

        prev_params = get_flat_params_from(model)
        success, new_params = linesearch(model, get_loss, prev_params,
                                         fullstep, neggdotstepdir / lm[0])
        set_flat_params_to(model, new_params)

        return loss

    # Set up function for computing the LBPO policy loss
    def compute_loss_pi(data, epoch_no=1):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        def get_kl(old_mean=None, new_mean=None):
            if old_mean is None:
                mean1 = ac.pi(obs)
            else:
                mean1 = old_mean

            log_std1, std1 = -2.99, 0.05
            if new_mean is None:
                mean0 = torch.autograd.Variable(mean1.data)
            else:
                mean0 = new_mean
            log_std0 = -2.99
            std0 = 0.05
            kl = log_std1 - log_std0 + (std0**2 + (mean0 - mean1).pow(2)) / (
                2.0 * std1**2) - 0.5
            return kl.sum(1, keepdim=True)

        def get_loss_pi():
            if ac.epsilon < 0:
                loss_pi = (ac.Qj1(torch.cat((obs, ac.pi(obs)), dim=1))).mean()
            else:
                # Surrogate objective that matches the gradient of the barrier at \pi=\pi_B
                if (beta / ac.epsilon) - beta_thres > 0:
                    loss_pi =  - (ac.Qv1(torch.cat((obs, ac.pi(obs)),dim=1))).mean() + \
                                (beta/ac.epsilon)*ac.Qj1(torch.cat((obs, ac.pi(obs)),dim=1)).mean()
                else:
                    loss_pi = -(ac.Qv1(torch.cat(
                        (obs, ac.pi(obs)), dim=1))).mean()

            return loss_pi

        old_mean = ac.pi(obs).detach().data
        loss_pi = trust_region_step(ac.pi, get_loss_pi, get_kl, target_l2, 0.1)

        if ac.epsilon >= 0:
            alpha_mix = safe_transform(
                data, ac.baseline_pi, ac.pi, ac.epsilon,
                np.sqrt(
                    np.maximum((target_l2 + 0.5) * (2.0 * 0.05**2) - 0.05**2, 0)))
            logger.store(AlphaMix=alpha_mix)
            if (beta / ac.epsilon) - beta_thres > 0:
                logger.store(CostGradWeight=(beta / ac.epsilon))
            else:
                logger.store(CostGradWeight=0)
        else:
            logger.store(AlphaMix=-1)
            logger.store(CostGradWeight=-1)
        # Useful extra info
        approx_l2 = torch.sqrt(torch.mean(
            (ac.pi(obs) - data['old_act'])**2)).item()
        approx_kl = get_kl(old_mean=old_mean,
                           new_mean=ac.pi(obs).detach()).mean().item()
        ent = 0
        clipped = [0]
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, l2=approx_l2, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, act, ret = data['obs'], data['act'], data['ret']
        return ((ac.Qv1(torch.cat((obs, act), dim=1)) -
                 ret)**2).mean(), ((ac.Qv2(torch.cat(
                     (obs, act), dim=1)) - ret)**2).mean()

    # Set up function for computing cost value loss
    def compute_loss_j(data):
        obs, act, cost_ret = data['obs'], data['act'], data['cost_ret']
        return ((ac.Qj1(torch.cat((obs, act), dim=1)) -
                 cost_ret)**2).mean(), ((ac.Qj2(torch.cat(
                     (obs, act), dim=1)) - cost_ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    pi_bc_optimizer = Adam(ac.pi.parameters(), lr=0.001)
    vf1_optimizer = Adam(ac.Qv1.parameters(), lr=vf_lr)
    vf2_optimizer = Adam(ac.Qv2.parameters(), lr=vf_lr)
    jf1_optimizer = Adam(ac.Qj1.parameters(), lr=jf_lr)
    jf2_optimizer = Adam(ac.Qj2.parameters(), lr=jf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(epoch_no, constraint_violations, constraint_violations_count):
        # global soft_penalty, penalty_optimizer
        data = buf.get()

        # Update the penalty
        curr_cost = logger.get_stats('EpCostRet')[0]

        if curr_cost - cost_lim > 0:
            logger.log('Warning! Safety constraint is already violated.',
                       'red')

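        # Remaining cost budget, scaled by (1 - gamma); nonnegative while the
        # current policy still satisfies the cost limit.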
        ac.epsilon = (1 - gamma) * (cost_lim - curr_cost)
        if epoch_no == 0 or ac.epsilon >= 0:
            ac.baseline_pi = copy.deepcopy(ac.pi)
            ac.baseline_Qj = copy.deepcopy(ac.Qj1)

        pi_l_old, v_l_old, j_l_old = 0, 0, 0
        pi_info_old = dict(kl=0, l2=0, ent=0, cf=0)

        if epoch_no == 0:
            for i in range(train_v_iters):
                vf1_optimizer.zero_grad()
                vf2_optimizer.zero_grad()
                loss_v1, loss_v2 = compute_loss_v(data)
                loss_v1.backward()
                loss_v2.backward()
                mpi_avg_grads(ac.Qv1)  # average grads across MPI processes
                mpi_avg_grads(ac.Qv2)
                vf1_optimizer.step()
                vf2_optimizer.step()

                jf1_optimizer.zero_grad()
                jf2_optimizer.zero_grad()
                loss_j1, loss_j2 = compute_loss_j(data)
                loss_j1.backward()
                loss_j2.backward()
                mpi_avg_grads(ac.Qj1)  # average grads across MPI processes
                mpi_avg_grads(ac.Qj2)
                jf1_optimizer.step()
                jf2_optimizer.step()

        # Trust region update for policy
        loss_pi, pi_info = compute_loss_pi(data, epoch_no=epoch_no)

        logger.store(StopIter=0)

        # Value and Cost Value function learning
        for i in range(train_v_iters):
            vf1_optimizer.zero_grad()
            vf2_optimizer.zero_grad()
            loss_v1, loss_v2 = compute_loss_v(data)
            loss_v1.backward()
            loss_v2.backward()
            mpi_avg_grads(ac.Qv1)  # average grads across MPI processes
            mpi_avg_grads(ac.Qv2)
            vf1_optimizer.step()
            vf2_optimizer.step()

            jf1_optimizer.zero_grad()
            jf2_optimizer.zero_grad()
            loss_j1, loss_j2 = compute_loss_j(data)
            loss_j1.backward()
            loss_j2.backward()
            mpi_avg_grads(ac.Qj1)  # average grads across MPI processes
            mpi_avg_grads(ac.Qj2)
            jf1_optimizer.step()
            jf2_optimizer.step()

        # Log changes from update
        kl, l2, ent, cf = pi_info['kl'], pi_info['l2'], pi_info_old[
            'ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     LossJ=j_l_old,
                     KL=kl,
                     L2=l2,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v1.item() - v_l_old),
                     DeltaLossJ=(loss_j1.item() - j_l_old),
                     Penalty=torch.nn.functional.softplus(soft_penalty))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_cost_ret, ep_len = env.reset(), 0, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, j, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))
            noise = 0.05 * np.random.randn(*a.shape)  # fixed-scale exploration noise
            a = a + noise
            next_o, r, d, info = env.step(a)
            ep_ret += r
            ep_cost_ret += info.get('cost', 0)
            ep_len += 1

            # save and log
            buf.store(o, a, r, info.get('cost', 0), v, j, logp, a)
            logger.store(VVals=v, JVals=j)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, j, _ = ac.step(
                        torch.as_tensor(o, dtype=torch.float32))
                else:
                    v, j = 0, 0
                buf.finish_path(v, j)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret,
                                 EpCostRet=ep_cost_ret,
                                 EpLen=ep_len)
                o, ep_ret, ep_cost_ret, ep_len = env.reset(), 0, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform LBPO update!
        update(epoch, constraint_violations, constraint_violations_count)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpCostRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('JVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('LossJ', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('DeltaLossJ', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Epsilon', ac.epsilon)
        logger.log_tabular('CostGradWeight', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Penalty', average_only=True)
        logger.log_tabular('AlphaMix', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
Example #8
 def log(self, msg, color='green'):
     """Print a colorized message to stdout."""
     if proc_id() == 0:
         print(colorize(msg, color, bold=True))