Example #1
    def log_model_weights(self):
        # Log a snapshot of every parameter tensor of the actor/critic
        # networks and their target copies.
        for name, param in self.actor.named_parameters():
            logger.logkv('actor/' + name, param.clone().cpu().data.numpy())
        for name, param in self.actor_target.named_parameters():
            logger.logkv('actor_target/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.critic.named_parameters():
            logger.logkv('critic/' + name, param.clone().cpu().data.numpy())
        for name, param in self.critic_target.named_parameters():
            logger.logkv('critic_target/' + name,
                         param.clone().cpu().data.numpy())
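
All of these examples assume an OpenAI-baselines-style `logger` module: `logkv(key, value)` buffers a key/value pair for the current iteration and `dumpkvs()` flushes the buffered pairs to the configured outputs. A minimal usage sketch, assuming the baselines `logger` is available (the log directory below is only illustrative):

from baselines import logger

logger.configure(dir='/tmp/logkv_demo')  # illustrative output directory
for step in range(3):
    logger.logkv('step', step)
    logger.logkv('reward', float(step))
    logger.dumpkvs()  # write this iteration's key/value pairs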
Example #2
def train(agent, env, N_steps, N_updates, ent_coef, lr,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, batch_size=4, N_train_sample_epochs=4, cliprange=0.2,
            save_interval=0):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)

    runner = Runner(env, agent, nsteps=N_steps, gamma=gamma, lam=lam)
    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    for update in range(1, N_updates+1):

        tstart = time.time()
        obs, returns, dones, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        epinfobuf.extend(epinfos)

        frac = 1.0 - (update - 1.0) / N_updates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        optimizer = optim.Adam(agent.parameters(), lr=lrnow)

        mblossnames = ['policy_loss', 'value_loss', 'entropy', 'approxkl', 'clipfrac']
        mblossvals = []

        N_sample_steps = obs.shape[0]
        inds = np.arange(N_sample_steps)

        agent.train()
        for _ in range(N_train_sample_epochs):
            np.random.shuffle(inds)
            for start in range(0, N_sample_steps, batch_size):

                end = start + batch_size
                mbinds = inds[start:end]
                obs_ = torch.tensor(obs[mbinds], requires_grad=True).float()
                returns_ = torch.tensor(returns[mbinds]).float()
                actions_ = torch.tensor(actions[mbinds]).float()
                values_ = torch.tensor(values[mbinds]).float()
                neglogpacs_ = torch.tensor(neglogpacs[mbinds]).float()

                advs_ = returns_ - values_
                advs_ = (advs_ - advs_.mean()) / (advs_.std() + 1e-8)

                optimizer.zero_grad()
                neglogp, entropy, vpred = agent.statistics(obs_, actions_)
                entropy = torch.mean(entropy)
                vpred_clip = values_ + torch.clamp(vpred - values_, -cliprangenow, cliprangenow)
                vf_loss = torch.max((vpred - returns_) ** 2, (vpred_clip - returns_) ** 2)
                vf_loss = 0.5 * torch.mean(vf_loss)
                ratio = torch.exp(neglogpacs_ - neglogp)
                pg_loss = torch.max(- advs_ * ratio, - advs_ * torch.clamp(ratio, 1.0-cliprangenow, 1.0+cliprangenow))
                pg_loss = torch.mean(pg_loss)
                approxkl = .5 * torch.mean((neglogp - neglogpacs_) ** 2)
                clipfrac = torch.mean((torch.abs(ratio - 1.0) > torch.tensor(cliprangenow)).float())
                loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
                loss.backward()
                # Clip the gradient norm to max_grad_norm before the update.
                torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
                optimizer.step()

                mblossvals.append([pg_loss.item(), vf_loss.item(), entropy.item(), approxkl.item(), clipfrac.item()])

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(N_sample_steps / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*N_steps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*N_sample_steps)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, mblossnames):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = os.path.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = os.path.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            torch.save(agent.state_dict(), savepath)
    env.close()
    return agent
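
Example #2 also leans on two small helpers defined elsewhere in this kind of codebase, `constfn` and `safemean`. A minimal sketch of what they usually look like in baselines-style PPO code (assumed here, not taken from this repository):

import numpy as np


def constfn(val):
    # Turn a constant hyperparameter into a schedule: f(frac) -> val.
    def f(_):
        return val
    return f


def safemean(xs):
    # Mean that tolerates an empty buffer (e.g. before any episode has finished).
    return np.nan if len(xs) == 0 else np.mean(xs)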
Example #3
    def log_model_weights(self):
        # Log a snapshot of every parameter tensor in the model.
        for name, param in self.model.named_parameters():
            logger.logkv(name, param.clone().cpu().data.numpy())
Example #4
    def train(self):
        self.net_mode(train=True)
        tfirststart = time.time()
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0
        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {
                    'obs': [],
                    'act': [],
                    'reward': [],
                    'new_obs': [],
                    'ach_goals': [],
                    'done': []
                }
            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                if (self.pretrain_dir is None and epoch < self.warmup_iter) or \
                        ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()
                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale, new_obs,
                                   ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            if self.use_her:
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    k_futures = np.random.choice(np.arange(
                        t + 1, episode_step),
                                                 self.k_future - 1,
                                                 replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        her_ob = np.concatenate(
                            (ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate(
                            (new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(), new_goal,
                                                  act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act,
                                           her_reward * self.reward_scale,
                                           her_new_ob, ach_goal.copy(), done)
            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean(
                    [rew for rew in epoch_episode_rewards])
                stats['rollout/ep_steps'] = safemean(
                    [l for l in epoch_episode_steps])
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()
            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and\
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)

                # self.log_model_weights()
                logger.dumpkvs()
                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)
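
The `self.obs_oms` object that normalizes observations above is not defined in this example. A minimal running mean/std sketch of what its `update`, `mean` and `std` members could look like (the class name, its fields, and the update formula are assumptions, not code from this repository):

import torch


class RunningMeanStd:
    # Tracks a running mean and standard deviation of observations.
    def __init__(self, shape):
        self.mean = torch.zeros(shape)
        self.var = torch.ones(shape)
        self.count = 1e-4

    def update(self, x):
        x = torch.as_tensor(x, dtype=torch.float32).reshape(-1, *self.mean.shape)
        batch_mean = x.mean(dim=0)
        batch_var = x.var(dim=0, unbiased=False)
        batch_count = x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        # Parallel (Chan et al.) combination of the two variance estimates.
        self.var = (self.var * self.count + batch_var * batch_count
                    + delta ** 2 * self.count * batch_count / total) / total
        self.mean = self.mean + delta * batch_count / total
        self.count = total

    @property
    def std(self):
        return torch.sqrt(self.var)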
Example #5
def train(agent,
          env,
          N_steps,
          N_updates,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          epsilon=1e-5,
          alpha=0.95,
          log_interval=10,
          batch_size=4,
          N_train_sample_epochs=4,
          cliprange=0.2,
          save_interval=0):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)

    runner = Runner(env, agent, nsteps=N_steps, gamma=gamma, lam=lam)
    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    for update in range(1, N_updates + 1):

        obs, returns, dones, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        epinfobuf.extend(epinfos)

        frac = 1.0 - (update - 1.0) / N_updates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        # alpha is RMSprop's smoothing constant and eps its numerical-stability term.
        optimizer = optim.RMSprop(agent.parameters(),
                                  lr=lrnow,
                                  alpha=alpha,
                                  eps=epsilon)

        mblossnames = ['policy_loss', 'value_loss', 'entropy']
        mblossvals = []

        agent.train()

        obs_ = torch.tensor(obs, requires_grad=True).float()
        returns_ = torch.tensor(returns).float()
        actions_ = torch.tensor(actions).float()
        values_ = torch.tensor(values).float()
        neglogpacs_ = torch.tensor(neglogpacs).float()
        advs_ = returns_ - values_

        optimizer.zero_grad()
        neglogp, entropy, vpred = agent.statistics(obs_, actions_)
        entropy = torch.mean(entropy)
        vf_loss = torch.mean(0.5 * (vpred - returns_)**2)
        pg_loss = torch.mean(advs_ * neglogp)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        loss.backward()
        # Clip the gradient norm to max_grad_norm before the update.
        torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        optimizer.step()

        mblossvals.append([pg_loss.item(), vf_loss.item(), entropy.item()])

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        N_sample_steps = obs.shape[0]
        fps = int(update * N_sample_steps / (tnow - tfirststart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * N_sample_steps)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, mblossnames):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            checkdir = os.path.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = os.path.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            torch.save(agent.state_dict(), savepath)
    env.close()
    return agent
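
`explained_variance(values, returns)` is another helper these examples assume. In baselines-style code it reports how much of the variance of the empirical returns the value predictions account for; a minimal sketch of that definition:

import numpy as np


def explained_variance(ypred, y):
    # 1 - Var[y - ypred] / Var[y]: 1.0 is a perfect value fit, 0.0 is no better
    # than predicting a constant, and negative values are worse than that.
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary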
Example #6
    def train(self):
        epinfobuf = deque(maxlen=20)
        tfirststart = time.time()

        for update in range(self.num_iters):
            tstart = time.time()
            res = self.runner.run()
            obs, returns, dones, actions, values, acts_neglog, epinfos = res
            if self.ob_rms:
                self.model.ob_rms.update(obs)
            epinfobuf.extend(epinfos)
            lossvals = {
                'policy_loss': [],
                'value_loss': [],
                'policy_entropy': [],
                'approxkl': [],
                'clipfrac': []
            }

            inds = np.arange(self.nbatch)
            for _ in range(self.noptepochs):
                np.random.shuffle(inds)
                for start in range(0, self.nbatch, self.nbatch_train):
                    end = start + self.nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, actions, returns,
                                                      acts_neglog, values))
                    info = self.model.train(*slices)
                    lossvals['policy_loss'].append(info['pg_loss'])
                    lossvals['value_loss'].append(info['vf_loss'])
                    lossvals['policy_entropy'].append(info['entropy'])
                    lossvals['approxkl'].append(info['approxkl'])
                    lossvals['clipfrac'].append(info['clipfrac'])

            tnow = time.time()
            fps = int(self.nbatch / (tnow - tstart))
            if update % self.log_interval == 0:
                ev = explained_variance(values, returns)
                logger.logkv("Learning rate",
                             self.model.optimizer.param_groups[0]['lr'])
                logger.logkv("serial_timesteps", update * self.nsteps)
                logger.logkv("nupdates", update)
                logger.logkv("total_timesteps", update * self.nbatch)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(ev))
                logger.logkv(
                    'eprewmean',
                    safemean([epinfo['reward'] for epinfo in epinfobuf]))
                logger.logkv(
                    'eplenmean',
                    safemean([epinfo['steps'] for epinfo in epinfobuf]))
                logger.logkv('time_elapsed', tnow - tfirststart)
                for name, value in lossvals.items():
                    logger.logkv(name, np.mean(value))
                logger.dumpkvs()
            if self.save_interval and \
                    update % self.save_interval == 0 and \
                    logger.get_dir():
                self.model.log_model_weights()
                avg_steps, avg_reward = self.runner.test()
                logger.logkv("nupdates", update)
                logger.logkv("test/total_timesteps", update * self.nbatch)
                logger.logkv('test/step', avg_steps)
                logger.logkv('test/reward', avg_reward)
                if not self.with_embed:
                    res = self.runner.test(train=True)
                    train_avg_steps, train_avg_reward = res
                    logger.logkv('train/step', train_avg_steps)
                    logger.logkv('train/reward', train_avg_reward)
                logger.dumpkvs()
                if avg_reward > self.model.best_rewards:
                    self.model.best_rewards = avg_reward
                    is_best = True
                else:
                    is_best = False
                self.model.save_model(is_best=is_best, step=update)
        self.env.close()
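
Finally, the `Runner` constructed with `gamma` and `lam` in Examples #2, #5 and #6 is not shown; the `returns` it produces are typically computed with Generalized Advantage Estimation (GAE). A minimal single-environment sketch, assuming NumPy arrays of per-step rewards, value predictions and done flags (where `dones[t]` marks that step `t` ended the episode):

import numpy as np


def gae_returns(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    # Generalized Advantage Estimation; the targets fed to the value loss are
    # returns = advantages + values.
    nsteps = len(rewards)
    advs = np.zeros(nsteps, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        nonterminal = 1.0 - float(dones[t])
        next_value = last_value if t == nsteps - 1 else values[t + 1]
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advs[t] = lastgaelam
    return advs + values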