Example #1
    def dump_tabular(self):
        """
        Write all of the diagnostics from the current iteration.

        Writes both to stdout, and to the output file.
        """
        if proc_id() == 0:
            vals = []
            key_lens = [len(key) for key in self.log_headers]
            max_key_len = max(15, max(key_lens))
            keystr = '%' + '%d' % max_key_len
            fmt = "| " + keystr + "s | %15s |"
            n_slashes = 22 + max_key_len
            print("-" * n_slashes)
            for key in self.log_headers:
                val = self.log_current_row.get(key, "")
                valstr = "%8.3g" % val if hasattr(val, "__float__") else val
                print(fmt % (key, valstr))
                vals.append(val)
            print("-" * n_slashes)
            if self.output_file is not None:
                if self.first_row:
                    self.output_file.write("\t".join(self.log_headers) + "\n")
                self.output_file.write("\t".join(map(str, vals)) + "\n")
                self.output_file.flush()
        self.log_current_row.clear()
        self.first_row = False
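
A minimal usage sketch for the logger above, assuming ``log_tabular`` (called in Examples #5 and #6) registers one value per column before ``dump_tabular`` prints the table and appends a row to the output file; ``logger_kwargs`` and the metric values below are placeholders:

logger_kwargs = dict(output_dir='/tmp/experiments/demo', exp_name='demo')  # illustrative
logger = EpochLogger(**logger_kwargs)
for epoch in range(3):
    logger.log_tabular('Epoch', epoch)
    logger.log_tabular('Loss', 1.0 / (epoch + 1))  # any scalar diagnostic
    logger.dump_tabular()  # prints the table and writes one row to progress.txt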
Example #2
    def save_config(self, config):
        """
        Log an experiment configuration.

        Call this once at the top of your experiment, passing in all important
        config vars as a dict. This will serialize the config to JSON, while
        handling anything which can't be serialized in a graceful way (writing
        as informative a string as possible). 

        Example use:

        .. code-block:: python

            logger = EpochLogger(**logger_kwargs)
            logger.save_config(locals())
        """
        config_json = convert_json(config)
        if self.exp_name is not None:
            config_json['exp_name'] = self.exp_name
        if proc_id() == 0:
            output = json.dumps(config_json,
                                separators=(',', ':\t'),
                                indent=4,
                                sort_keys=True)
            print(colorize('Saving config:\n', color='cyan', bold=True))
            print(output)
            with open(osp.join(self.output_dir, "config.json"), 'w') as out:
                out.write(output)
Example #3
    def __init__(self,
                 output_dir=None,
                 output_fname='progress.txt',
                 exp_name=None):
        """
        Initialize a Logger.

        Args:
            output_dir (string): A directory for saving results to. If 
                ``None``, defaults to a temp directory of the form
                ``/tmp/experiments/somerandomnumber``.

            output_fname (string): Name for the tab-separated-value file 
                containing metrics logged throughout a training run. 
                Defaults to ``progress.txt``. 

            exp_name (string): Experiment name. If you run multiple training
                runs and give them all the same ``exp_name``, the plotter
                will know to group them. (Use case: if you run the same
                hyperparameter configuration with multiple random seeds, you
                should give them all the same ``exp_name``.)
        """
        if proc_id() == 0:
            self.output_dir = output_dir or "/tmp/experiments/%i" % int(
                time.time())
            if osp.exists(self.output_dir):
                print(
                    "Warning: Log dir %s already exists! Storing info there anyway."
                    % self.output_dir)
            else:
                os.makedirs(self.output_dir)
            self.output_file = open(osp.join(self.output_dir, output_fname),
                                    'w')
            atexit.register(self.output_file.close)
            print(
                colorize("Logging data to %s" % self.output_file.name,
                         'green',
                         bold=True))
        else:
            self.output_dir = None
            self.output_file = None
        self.first_row = True
        self.log_headers = []
        self.log_current_row = {}
        self.exp_name = exp_name
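
A sketch of constructing the ``Logger`` directly; the directory and experiment name are illustrative. As the docstring notes, runs sharing an ``exp_name`` (for example, the same hyperparameters under different seeds) are grouped by the plotter.

logger = Logger(output_dir='/tmp/experiments/demo',  # illustrative path
                output_fname='progress.txt',
                exp_name='demo_run')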
Example #4
    def save_state(self, state_dict, model, itr=None):
        """
        Saves the state of an experiment.

        To be clear: this is about saving *state*, not logging diagnostics.
        All diagnostic logging is separate from this function. This function
        will save whatever is in ``state_dict``---usually just a copy of the
        environment---and the most recent copy of the model via ``model``.

        Call with any frequency you prefer. If you only want to maintain a
        single state and overwrite it at each call with the most recent
        version, leave ``itr=None``. If you want to keep all of the states you
        save, provide unique (increasing) values for ``itr``.

        Args:
            state_dict (dict): Dictionary containing essential elements to
                describe the current state of training.

            model (nn.Module): A model which contains the policy.

            itr: An int, or None. Current iteration of training.
        """
        if proc_id() == 0:
            fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
            try:
                joblib.dump(state_dict, osp.join(self.output_dir, fname))
            except:
                self.log('Warning: could not pickle state_dict.', color='red')
            self._torch_save(model, itr)
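
Based on the call in Example #5, a sketch of the two saving modes described in the docstring; ``env``, ``actor_critic``, and ``epoch`` stand in for the caller's objects:

# Overwrite a single snapshot on every call (vars.pkl / torch_save.pt):
logger.save_state({'env': env}, actor_critic, itr=None)
# Or keep every snapshot by passing an increasing iteration number:
logger.save_state({'env': env}, actor_critic, itr=epoch)  # vars<epoch>.pkl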
Example #5
def ppo(env_fn,
        actor_critic=model.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        ent_coeff=0.0,
        epochs=50,
        clip=0.2,
        p_lr=3e-4,
        v_lr=1e-3,
        ppo_epochs=80,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (PPO) implemented with GAE-lambda advantage estimation.
    """
    # Loggers
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Update the seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    env = env_fn()

    # Build the policy/value networks from the constructor passed in
    # (defaults to model.mlp_actor_critic).
    actor_critic = actor_critic(env.observation_space.shape,
                                action_space=env.action_space,
                                **ac_kwargs)
    rb = ReplayBuffer(local_steps_per_epoch, env.observation_space.shape,
                      env.action_space.shape)

    # Number of parameters
    var_counts = tuple(
        sum(p.numel() for p in module.parameters() if p.requires_grad)
        for module in [actor_critic.p_net, actor_critic.v_net])
    logger.log('Number of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Optimizers
    p_optimizer = torch.optim.Adam(actor_critic.p_net.parameters(), lr=p_lr)
    v_optimizer = torch.optim.Adam(actor_critic.v_net.parameters(), lr=v_lr)
    sync_all_params(actor_critic.parameters())

    # Initializations
    ep_ret, ep_len, r, val, done = 0, 0, 0, 0, False
    start_time = time.time()

    def update():
        actor_critic.train()
        o, logp_old, a, _, rew2g, val, adv = rb.read()
        for i in range(ppo_epochs):
            _, logp, _ = actor_critic.p_net(o, a)
            ratio = (logp - logp_old).exp()  # Ratio of policies
            ent = (-logp).mean()  # Entropy
            kl = (logp_old - logp).mean()

            kl = mpi_avg(kl.item())
            if kl > 1.5 * target_kl:  # Additional KL-divergence limit between consecutive policies
                logger.log(
                    "Early stopping at step %d due to reaching max KL-divergence"
                    % i)
                break

            p_loss = -torch.min(
                ratio * adv,
                torch.clamp(ratio, 1 - clip, 1 + clip) * adv).mean()
            p_loss = p_loss - ent * ent_coeff
            p_optimizer.zero_grad()
            p_loss.backward()
            average_gradients(p_optimizer.param_groups)
            p_optimizer.step()

            val = actor_critic.v_net(o)
            v_loss = (val - rew2g).pow(2).mean()
            v_optimizer.zero_grad()
            v_loss.backward()
            average_gradients(v_optimizer.param_groups)
            v_optimizer.step()

    # Agent in the Wild
    for epoch in range(epochs):
        obs = env.reset()
        actor_critic.eval()
        for t in range(local_steps_per_epoch):
            #env.render()
            a, _, logp, v = actor_critic(torch.Tensor(obs.reshape(1, -1)))
            rb.write(obs, logp, a, r, v)
            obs, r, done, _ = env.step(a.detach().numpy()[0])

            ep_ret += r
            ep_len += 1

            # Do not lose r, v at terminal states or at the epoch cutoff
            if done or t == local_steps_per_epoch - 1:
                v_d = r if done else actor_critic.v_net(
                    torch.Tensor(obs.reshape(1, -1))).item()
                rb.calc_adv(v_d)
                if done:
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                obs, r, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, actor_critic, None)

        update()
        # Logger Monitor
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
    env.close()
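
The ``ReplayBuffer.calc_adv`` call is not shown in these examples; below is a minimal sketch of the GAE-lambda advantage computation it presumably performs, with ``gamma`` and ``lam`` as assumed discount parameters:

import numpy as np

def gae_lambda(rewards, values, last_val, gamma=0.99, lam=0.97):
    """Generalized Advantage Estimation over one trajectory segment.

    ``last_val`` bootstraps the value of the state after the segment
    (0 if the segment ended at a terminal state).
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    vals = np.append(np.asarray(values, dtype=np.float64), last_val)
    deltas = rewards + gamma * vals[1:] - vals[:-1]  # one-step TD residuals
    adv = np.zeros(len(deltas))
    running = 0.0
    for t in reversed(range(len(deltas))):  # discounted backward accumulation
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv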
Example #6
def dqn(env_fn, actor_critic=model.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000,
        epochs=50, clip=0.2, p_lr=3e-4, v_lr=1e-3, ppo_epochs=80, target_kl=0.01,
        logger_kwargs=dict(), save_freq=10):
    """
    Simplified on-policy training loop using a PPO-style clipped surrogate
    objective with GAE-lambda advantage estimation.
    """
    # Loggers
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    
    # Update the seed
    seed += 1000*proc_id()
    torch.manual_seed(seed)

    local_steps_per_epoch = int(steps_per_epoch/num_procs())
    env = env_fn()

    # Build the networks from the constructor passed in (defaults to model.mlp_actor_critic).
    actor_critic = actor_critic(env.observation_space.shape, action_space=env.action_space, **ac_kwargs)
    rb = ReplayBuffer(local_steps_per_epoch, env.observation_space.shape, env.action_space.shape)
    
    # Optimizers
    p_optimizer = torch.optim.Adam(actor_critic.p_net.parameters(), lr=p_lr)
    v_optimizer = torch.optim.Adam(actor_critic.v_net.parameters(), lr=v_lr)
    sync_all_params(actor_critic.parameters())

    # Initializations
    ep_ret, ep_len, r, val, done = 0, 0, 0, 0, False
    start_time = time.time()
    
    def update():
        o, logp_old, a, _, rew2g, val, adv = rb.read(on_policy=True)
        for i in range(ppo_epochs):
            _, logp, _, val = actor_critic(o,a)
            ratio = (logp - logp_old).exp() # Ratio of policies
            
            kl = (logp_old - logp).mean()
            kl = mpi_avg(kl.item())
            if kl > 1.5 * target_kl: # Additional KL-divergence limit between consecutive policies
                break
            
            p_loss = -torch.min(ratio * adv, torch.clamp(ratio, 1-clip, 1+clip) * adv).mean()
            p_optimizer.zero_grad()
            p_loss.backward()
            p_optimizer.step()

            v_loss = (val - rew2g).pow(2).mean()
            v_optimizer.zero_grad()
            v_loss.backward()
            v_optimizer.step()

    # Agent in the Wild 
    for epoch in range(epochs):
        obs = env.reset()
        for t in range(local_steps_per_epoch):
            #env.render()
            a, _, logp, v = actor_critic(torch.Tensor(obs))
            rb.write(obs, logp, a, r, v)
            obs, r, done, _ = env.step(a.detach().numpy())

            ep_ret += r
            ep_len += 1
            
            # Do not lose r, v at terminal states or at the epoch cutoff
            if done or t==local_steps_per_epoch-1:
                v_d = r if done else actor_critic.v_net(torch.Tensor(obs)).item()
                rb.calc_adv(v_d)
                if done:
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                obs, r, done, ep_ret, ep_len = env.reset(), 0,  False, 0, 0
        
        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, actor_critic, None)

        update()
        # Logger Monitor
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
    env.close()
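
A sketch of launching either training loop with a Gym-style environment factory; ``gym.make`` and the environment id are illustrative and not part of the examples above:

import gym

ppo(lambda: gym.make('CartPole-v1'),  # env_fn must build a fresh environment
    steps_per_epoch=4000,
    epochs=50,
    logger_kwargs=dict(output_dir='/tmp/experiments/ppo_cartpole',
                       exp_name='ppo_cartpole'))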
Example #7
    def log(self, msg, color='green'):
        """Print a colorized message to stdout."""
        if proc_id() == 0:
            print(colorize(msg, color, bold=True))
Example #8
    def _torch_save(self, model, itr=None):
        if proc_id() == 0:
            fname = 'torch_save.pt' if itr is None else 'torch_save%d.pt' % itr
            torch.save(model, osp.join(self.output_dir, fname))